In [15]:
import re
import nltk
import numpy as np
import sklearn
import pandas as pd
from patsy import dmatrices
from scikitplot import plotters as skplt
import matplotlib.pyplot as plt
from pandas import Series
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from wordcloud import WordCloud
from nltk.stem import RegexpStemmer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.cross_validation import train_test_split
from sklearn.preprocessing import label_binarize
from sklearn import metrics
from sklearn.metrics import roc_curve, auc
from sklearn import datasets
from sklearn.metrics import classification_report

# Load the labelled tweets and split them into per-class subsets.
# NOTE(review): going by the variable names, type == 0 means "others" and
# type == 1 means "self" - confirm against the data dictionary.
train = pd.read_json("../../data/SelfVsOthers.json")
# .copy() gives each subset its own data: the subsets are rewritten in place
# by clean_text() later, and writing into a slice of `train` would otherwise
# raise pandas' SettingWithCopyWarning.
train_others = train.loc[train['type'] == 0].copy()
train_self = train.loc[train['type'] == 1].copy()

# Define Word Stops: NLTK's English list plus a few corpus-specific tokens
stopset = set(stopwords.words('english'))
morewords = ["'s", "swine", "bird", "h1n1", "'ve", "lol", "pig"]
stopset.update(morewords)
# Remove words from the stopword list that should be kept as features
itemsToRemove = ['can','am', 'are', 're', 'm','have','has','i', 'you', 'he', 'she', 'we', 'they']
# Use a set difference so stopset stays a set; rebuilding it as a list would
# turn every `w in stopset` membership test into a linear scan.
stopset = stopset.difference(itemsToRemove)

In [16]:
#Methods
# Remove URLs, RTs, and twitter handles
def clean_data(text):
    """Strip non-ASCII characters, URLs, @-mentions and bare 'RT' tokens.

    The text is split on whitespace; a token is dropped when it contains the
    substring 'http', starts with '@', or is exactly 'RT'. The remaining
    tokens are re-joined with single spaces.

    Parameters:
        text: the raw tweet text.

    Returns:
        The cleaned text as a single space-separated string.
    """
    # str.replace() treats its first argument as literal text, so the previous
    # call never matched anything; re.sub() applies it as a character class.
    text = re.sub(r'[^\x00-\x7F]', '', text)
    words = [token for token in text.split()
             if 'http' not in token and not token.startswith('@') and token != 'RT']
    return ' '.join(words)

# Text to Lower Case
def text_to_lower(text):
    """Return ``text`` with all cased characters converted to lower case."""
    lowered = text.lower()
    return lowered

# Remove some characters
def remove_special_characters(text):
    """Delete punctuation, symbols and digits listed in ``bad_chars`` from ``text``.

    Parameters:
        text: the string to filter.

    Returns:
        ``text`` with every matching character removed (nothing is inserted
        in its place).
    """
    bad_chars = '-#?(){}<>:;.!$%&/=+*^-`\'0123456789'
    # NOTE: inside the character class the sequence ^-` is read as a range
    # (from '^' through '`'), so the underscore is stripped as well.
    return re.sub('[%s]' % bad_chars, '', text)

# Remove stopwords from the text
def remove_stopwords(text):
    """Drop every token of ``text`` that appears in the module-level ``stopset``.

    The text is split with ``word_tokenize``; tokens not found in ``stopset``
    are kept in their original order and joined with single spaces.

    Parameters:
        text: the string to filter.

    Returns:
        The surviving tokens as one space-separated string.
    """
    # The filtered list used to be built twice (a comprehension whose result
    # was discarded, then an equivalent loop); a single pass is sufficient.
    kept_tokens = [w for w in word_tokenize(text) if w not in stopset]
    return ' '.join(kept_tokens)

# Stemming words
def stem_words(text):
    """Strip common English suffixes from each token of ``text``.

    The text is tokenised with ``word_tokenize`` and every token is passed
    through an NLTK ``RegexpStemmer`` built from the suffix alternation below.
    ``min=4`` is forwarded to the stemmer; per the NLTK documentation this is
    the minimum length a word must have to be stemmed.

    Parameters:
        text: the string to stem.

    Returns:
        The stemmed tokens joined with single spaces.
    """
    # Regex for suffixes
    suffix_stemmer = RegexpStemmer('ing$|s$|able$|ible$|ful$|less$|ive$|acy$|al$|ance$|ence$|dom$|er$|or$|ism$|ist$|ity$|ty$|ment$|ship$|sion$|tion$|ate$|en$|ify$|fy$|ize$|ise$', min=4)
    return ' '.join(suffix_stemmer.stem(token) for token in word_tokenize(text))


def clean_text(df):
    """Run the full cleaning pipeline over the 'text' column of ``df``.

    Each value goes through, in order: ``clean_data``, ``text_to_lower``,
    ``remove_special_characters``, ``remove_stopwords`` and ``stem_words``.

    Parameters:
        df: a DataFrame with a 'text' column of strings.

    Returns:
        The same ``df`` object, whose 'text' column has been replaced by the
        cleaned strings (the frame is modified in place).
    """
    def _clean_one(text):
        # Apply the individual cleaning steps in their original order.
        text = clean_data(text)
        text = text_to_lower(text)
        text = remove_special_characters(text)
        text = remove_stopwords(text)
        return stem_words(text)

    # One column-wise assignment replaces the iterrows()/set_value() loop:
    # DataFrame.set_value is deprecated (removed in pandas 1.0) and the loop
    # wrote into the frame while iterating over it.
    df['text'] = df['text'].apply(_clean_one)
    return df

def create_wordcloud(list_words, name_cloud):
    """Render a word cloud, save it under ./wordclouds/ and display it.

    Parameters:
        list_words: passed unchanged to ``WordCloud.generate``
            (presumably one string of text - TODO confirm with callers).
        name_cloud: file name appended to './wordclouds/' for the saved image.

    Returns:
        None. The figure is written to disk at 300 dpi and then shown.
    """
    cloud_options = dict(
        stopwords=stopset,
        background_color='black',
        width=1800,
        height=1400,
    )
    cloud = WordCloud(**cloud_options).generate(list_words)
    output_path = './wordclouds/' + name_cloud

    plt.imshow(cloud)
    plt.axis('off')
    # Save before show(): the figure is written while it is still current.
    plt.savefig(output_path, dpi=300)
    plt.show()

def print_frequency(words, number):
    """Print the ``number`` most frequent items of ``words``, one per line.

    Each item is printed followed by a comma; the counts themselves are not
    printed.

    Parameters:
        words: iterable of tokens handed to ``nltk.FreqDist``.
        number: how many of the most common items to print.
    """
    # Frequency distribution, most common first
    top_items = nltk.FreqDist(words).most_common(number)
    for word, _count in top_items:
        print('{},'.format(word))



In [17]:
# Run the cleaning pipeline over the per-class subsets and then the full
# frame, in that order (clean_text returns the frame it was given).
train_others, train_self, train = map(
    clean_text, (train_others, train_self, train)
)
# Tokenise all cleaned "self" tweets into one flat list of words
list_words = word_tokenize(' '.join(train_self['text']))
#print_frequency(list_words,200)

In [18]:
#Vectorisation
# Iterating a DataFrame yields its column labels, so the vocabulary below is
# built from the header row of predictors.csv.
most_common = pd.read_csv("./predictors.csv")
self = word_tokenize(' '.join(most_common))

# Count how often each vocabulary term occurs in every cleaned tweet
list_self = train['text'].tolist()
cv_self = CountVectorizer(vocabulary=self)
array_self = cv_self.fit_transform(list_self).toarray()
# Create CSV file
#numpy.savetxt("self.csv", np.asarray(array_self.astype(int)), fmt='%i', delimiter=",")

In [19]:
#foo =  pd.read_csv("self.csv")
#foo['RESULT'] = Series(train['type'], index=foo.index)
#foo['ID'] = Series(train['id'], index=foo.index)
#foo.to_csv('./data_vectorised/self.csv',sep=',')

# Logistic Regression

In [20]:
data = pd.read_csv("./data_vectorised/self.csv")

# Patsy formula: RESULT is the response, every other term is a predictor column
model_formula = "RESULT ~ flu + gett + i + im  + he + school + sleep + dr + catch + make + tomorrow + since + damn + bit + great + keep + h + tired + first + soon + everyone + away + head + thought + someth + ready + next + start + com + fuck + may + little + anyone + lot + body + doct + could + long + god + seem + night + man + care + ok + done + look + stay + weekend + say + eith + nose + isnt + tonight + tell + office + regular + hand + shit + enough + come + rest + mask + help + please + fun + stomach + would + re + yes + sure + stupid + viru + nervou + due + crap + tak + cause + l + hopefully + life + old + wond + yeah + hell + health + woke + disease + clinic + every + must + suck + many + someone + actually + least + headache + kind + nas + concerned + havent + nev + hurt + youre + love + girl + friend + read + hour + hard + b + also + bc + us + anyth + g + ch"
y, X = dmatrices(model_formula, data, return_type = 'dataframe')
# scikit-learn expects a 1-D target, so flatten the single-column frame
y = np.ravel(y)

# Fit a logistic regression on the complete data set (fit() returns the estimator)
model = LogisticRegression().fit(X, y)

# Accuracy on the same rows the model was trained on
model.score(X, y)

0.72587222473946533

In [21]:
# Model Evaluation Using a Validation Set
# Hold out 30% of the rows for testing; random_state=0 fixes the split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)
# Start from a fresh estimator and fit it on the training portion only.
model = LogisticRegression()
# Last expression of the cell: its return value is what the notebook displays.
model.fit(X_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [22]:
# predict class labels for the test set
predicted = model.predict(X_test)
#print predicted
# Decision-function scores for the test rows. The model was already fitted on
# (X_train, y_train) in the previous cell, so it is not refitted here.
y_score = model.decision_function(X_test)

In [23]:
# generate class probabilities for the test set
# (column 1 of the result is what the ROC AUC computation uses later on)
probs = model.predict_proba(X_test)
#print probs

In [24]:
# generate evaluation metrics on the held-out test set
# Parenthesised single-argument print behaves the same on Python 2 and 3 and
# matches the print(...) style already used in print_frequency.
print(metrics.accuracy_score(y_test, predicted))
print(metrics.roc_auc_score(y_test, probs[:, 1]))

0.704374057315
0.732988537004


In [25]:
# Confusion matrix and per-class precision/recall on the held-out test set.
# Parenthesised single-argument print behaves the same on Python 2 and 3.
print(metrics.confusion_matrix(y_test, predicted))
print(metrics.classification_report(y_test, predicted))

[[ 65 147]
 [ 49 402]]
             precision    recall  f1-score   support

        0.0       0.57      0.31      0.40       212
        1.0       0.73      0.89      0.80       451

avg / total       0.68      0.70      0.67       663



In [None]:
# The model in scope was fitted on X_train only, so the ROC, confusion-matrix
# and precision-recall plots use the held-out split; scoring on the full X
# would mix in rows the model has already seen and overstate performance.
#ROC Curve
probas = model.predict_proba(X_test)
skplt.plot_roc_curve(y_true=y_test, y_probas=probas)
#plt.show()
# Confusion Matrix
preds = model.predict(X_test)
skplt.plot_confusion_matrix(y_true=y_test, y_pred=preds)
#plt.show()
#Learning Curve
# Receives the estimator together with the complete X, y, as before.
skplt.plot_learning_curve(model, X, y)
#plt.show()

#Precision Recall Curve
skplt.plot_precision_recall_curve(y_true=y_test, y_probas=probas)
plt.show()