In [153]:
import numpy as np
import pandas as pd
import os

# importing the data

In [154]:
train = pd.read_csv('tweets\\train.csv')
test = pd.read_csv('tweets\\test.csv')

# taking a look at the data

In [155]:
train.head(5)

Unnamed: 0,id,label,tweet
0,1,0,@user when a father is dysfunctional and is s...
1,2,0,@user @user thanks for #lyft credit i can't us...
2,3,0,bihday your majesty
3,4,0,#model i love u take with u all the time in ...
4,5,0,factsguide: society now #motivation


In [156]:
test.head(5)

Unnamed: 0,id,tweet
0,31963,#studiolife #aislife #requires #passion #dedic...
1,31964,@user #white #supremacists want everyone to s...
2,31965,safe ways to heal your #acne!! #altwaystohe...
3,31966,is the hp and the cursed child book up for res...
4,31967,"3rd #bihday to my amazing, hilarious #nephew..."


In [157]:
len(train)

31962

# splitting the training data into 2 sets for learning and testing our model

In [158]:
train_train = train.loc[:31962/0.75 -1]
train_test = train.loc[31962/3 :]

# importing NLP libraries and modules

In [159]:
import nltk as nlp
import string as s
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.porter import PorterStemmer

In [160]:
eng_stop_words = sk.feature_extraction.stop_words.ENGLISH_STOP_WORDS.copy

# functions to remove punctuations , stop words and stemming the tokenized words in each tweet

In [161]:
def remove_punctuation(tweet):
    tweet = ' '.join(word.strip(s.punctuation) for word in tweet.split())
    tweet_ret = ' '.join(word.strip(s.digits) for word in tweet.split() if word.isalnum())
    return tweet_ret

In [162]:
#applying to train data splits first
cleaned_train_train = train_train['tweet'].apply(lambda x : remove_punctuation(x))
cleaned_train_test = train_test['tweet'].apply(lambda x : remove_punctuation(x))

In [163]:
#applying to full datasets of train and test
cleaned_tweets_train = train['tweet'].apply(lambda x : remove_punctuation(x))
cleaned_tweets_test = test['tweet'].apply(lambda x : remove_punctuation(x))

In [164]:
def remove_stop_words(tweet):
    tweet_ret = ' '.join(word for word in tweet.split() if word not in list(eng_stop_words()))
    return tweet_ret

In [165]:
#applying to train data splits first
cleaned_train_train = cleaned_train_train.apply(lambda x : remove_stop_words(x))
cleaned_train_test = cleaned_train_test.apply(lambda x : remove_stop_words(x))

In [166]:
#applying to full datasets of train and test
cleaned_tweets_train = cleaned_tweets_train.apply(lambda x : remove_stop_words(x))
cleaned_tweets_test = cleaned_tweets_test.apply(lambda x : remove_stop_words(x))

In [167]:
def apply_stemmer(tweet):
    tweet_ret = ' '.join(PorterStemmer().stem(word) for word in tweet.split())
    return tweet_ret

In [168]:
#applying to train data splits first
cleaned_train_train = cleaned_train_train.apply(lambda x : apply_stemmer(x))
cleaned_train_test = cleaned_train_test.apply(lambda x : apply_stemmer(x))

In [169]:
#applying to full datasets of train and test
cleaned_tweets_train = cleaned_tweets_train.apply(lambda x : apply_stemmer(x))
cleaned_tweets_test = cleaned_tweets_test.apply(lambda x : apply_stemmer(x))

# applying count vectorizer

In [170]:
#applying to train data splits first
cvector = CountVectorizer().fit(cleaned_train_train.get_values())
bow_train_train = cvector.transform(cleaned_train_train.get_values())
bow_train_test = cvector.transform(cleaned_train_test.get_values())

In [171]:
#applying to full datasets of train and test
cv = CountVectorizer().fit(cleaned_tweets_train.get_values())
bow_train = cv.transform(cleaned_tweets_train.get_values())
bow_test = cv.transform(cleaned_tweets_test.get_values())

# applying term-doument frequency

In [172]:
#applying to train data splits first
tfidftransform = TfidfTransformer().fit(bow_train_train)
tfidf_train_train = tfidftransform.transform(bow_train_train)
tfidf_train_test = tfidftransform.transform(bow_train_test)

In [173]:
#applying to full datasets of train and test
tfidf = TfidfTransformer().fit(bow_train)
messages_tfidf_train = tfidf.transform(bow_train)
messages_tfidf_test = tfidf.transform(bow_test)

# importing logistic regression modules and its learning matrices

In [174]:
from sklearn.linear_model import LogisticRegressionCV , LogisticRegression
from sklearn.metrics import f1_score , classification_report,confusion_matrix

In [175]:
#applying to train data splits first
classifier_train= LogisticRegressionCV(fit_intercept=True,solver='liblinear', max_iter = 500)
classifier_train.fit(tfidf_train_train,train_train['label'])

LogisticRegressionCV(Cs=10, class_weight=None, cv=None, dual=False,
           fit_intercept=True, intercept_scaling=1.0, max_iter=500,
           multi_class='ovr', n_jobs=1, penalty='l2', random_state=None,
           refit=True, scoring=None, solver='liblinear', tol=0.0001,
           verbose=0)

In [176]:
classifier_train.score(tfidf_train_test,train_test['label'])

0.99619861085038486

In [177]:
f1_score(y_pred=classifier_train.predict(tfidf_train_test),y_true=train_test['label'])

0.97243960530792783

In [178]:
classification_report(y_pred=classifier_train.predict(tfidf_train_test),y_true=train_test['label'])

'             precision    recall  f1-score   support\n\n          0       1.00      1.00      1.00     19814\n          1       0.99      0.96      0.97      1494\n\navg / total       1.00      1.00      1.00     21308\n'

In [179]:
confusion_matrix(y_pred=classifier_train.predict(tfidf_train_test),y_true=train_test['label'])

array([[19798,    16],
       [   65,  1429]])

In [180]:
#applying to full datasets of train and test
classifier= LogisticRegressionCV(fit_intercept=True,solver='liblinear', max_iter = 500)
classifier.fit(messages_tfidf_train,train['label'])

LogisticRegressionCV(Cs=10, class_weight=None, cv=None, dual=False,
           fit_intercept=True, intercept_scaling=1.0, max_iter=500,
           multi_class='ovr', n_jobs=1, penalty='l2', random_state=None,
           refit=True, scoring=None, solver='liblinear', tol=0.0001,
           verbose=0)

In [181]:
classifier.score(messages_tfidf_train,train['label'])

0.99590138289218444

In [182]:
test['label'] = list(classifier.predict(messages_tfidf_test))

In [183]:
submission = test[['id','label']]

In [185]:
submission.to_csv('tweets\\test_predictions.csv',index=None)