In [13]:
import re
import nltk
import numpy as np
import sklearn
import pandas as pd
from patsy import dmatrices
from scikitplot import plotters as skplt
import matplotlib.pyplot as plt
from pandas import Series
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from wordcloud import WordCloud
from nltk.stem import RegexpStemmer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.cross_validation import train_test_split
from sklearn.preprocessing import label_binarize
from sklearn import metrics
from sklearn.metrics import roc_curve, auc, classification_report

# Labelled tweets: one JSON file per binary annotation task.
train3 = pd.read_json("../../data/RelatedVsNotRelated.json")
train2 = pd.read_json("../../data/AwarenessVsInfection.json")
train = pd.read_json("../../data/SelfVsOthers.json")

# Related vs. not related
#   type 0 -> not related to influenza
#   type 1 -> related to influenza
# Only the "not related" rows are kept.
train_notrelated = train3[train3['type'] == 0]

# Awareness vs. infection
#   type 0 -> influenza infection
#   type 1 -> influenza awareness
# Only the "awareness" rows are kept.
train_awareness = train2[train2['type'] == 1]

# Self vs. others
#   type 0 -> others (the tweet describes someone else)
#   type 1 -> self (the tweet describes the author)
# Both classes are kept, each in its own frame.
train_others = train[train['type'] == 0]
train_self = train[train['type'] == 1]

# Build the stop-word set shared by remove_stopwords() and create_wordcloud().
stopset = set(stopwords.words('english'))
# Extra tokens to drop on top of NLTK's English list.
morewords = ["'s", "swine", "bird", "h1n1", "'ve", "lol", "pig"]
stopset.update(morewords)
# Words that must NOT be treated as stop words (they stay in the text).
itemsToRemove = ['can','am', 'are', 're', 'm','have','has','i', 'you', 'he', 'she', 'we', 'they']
# difference_update() keeps `stopset` a real set, so the per-token
# `in stopset` checks stay O(1) instead of scanning a list.
stopset.difference_update(itemsToRemove)

#Methods
def clean_data(text):
    """Strip non-ASCII characters, URLs, retweet markers and @handles.

    Parameters
    ----------
    text : str
        Raw tweet text.

    Returns
    -------
    str
        The surviving whitespace-separated tokens joined by single spaces.
    """
    #text= text.decode('utf-8')
    # str.replace() looks for a literal substring, so a character class has
    # to go through re.sub() for non-ASCII characters to actually be dropped.
    text = re.sub(r'[^\x00-\x7F]', '', text)
    # A token is discarded when it contains 'http' (links), starts with '@'
    # (handles) or is exactly 'RT' (case-sensitive retweet marker).
    words = [token for token in text.split()
             if 'http' not in token and not token.startswith('@') and token != 'RT']
    return ' '.join(words)

def text_to_lower(text):
    """Return a lower-cased copy of ``text``."""
    lowered = text.lower()
    return lowered

def remove_special_characters(text):
    """Delete punctuation symbols and digits from ``text``.

    Exactly the characters listed in ``bad_chars`` are removed; everything
    else (letters, spaces, underscores, ...) is left untouched.

    Parameters
    ----------
    text : str
        Input text.

    Returns
    -------
    str
        ``text`` with every listed character removed.
    """
    bad_chars = '-#?(){}<>:;.!$%&/=+*^-`\'0123456789'
    # re.escape() is required: unescaped, the "^-`" run inside the character
    # class is read as a range from '^' to '`', which also swallows '_'.
    rgx = re.compile('[%s]' % re.escape(bad_chars))
    return rgx.sub('', text)

def remove_stopwords(text):
    """Drop every token of ``text`` that is listed in the module-level ``stopset``.

    Parameters
    ----------
    text : str
        Text to filter; it is split into tokens with ``word_tokenize``.

    Returns
    -------
    str
        The remaining tokens joined by single spaces.
    """
    # Single pass: the filtered list used to be built twice, with the first
    # result thrown away.
    kept_tokens = [w for w in word_tokenize(text) if w not in stopset]
    return ' '.join(kept_tokens)

def stem_words(text):
    """Strip common English suffixes from every token of ``text``.

    The text is tokenised with ``word_tokenize``, each token is passed through
    a ``RegexpStemmer`` built from the suffix alternation below (constructed
    with ``min=4``), and the results are joined with single spaces.
    """
    # Suffix alternation; each alternative is anchored at the end of the word.
    suffix_pattern = 'ing$|s$|able$|ible$|ful$|less$|ive$|acy$|al$|ance$|ence$|dom$|er$|or$|ism$|ist$|ity$|ty$|ment$|ship$|sion$|tion$|ate$|en$|ify$|fy$|ize$|ise$'
    stemmer = RegexpStemmer(suffix_pattern, min=4)
    return ' '.join(stemmer.stem(token) for token in word_tokenize(text))


def clean_text(df):
    """Run the full cleaning pipeline over ``df['text']``, modifying ``df`` in place.

    Every value of the ``'text'`` column is passed, in order, through
    ``clean_data``, ``text_to_lower``, ``remove_special_characters``,
    ``remove_stopwords`` and ``stem_words``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``'text'`` column of strings.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with its ``'text'`` column overwritten.
    """
    def _clean_one(raw_text):
        # Order matters: 'RT' is matched case-sensitively by clean_data(),
        # so it has to run before lower-casing.
        cleaned = clean_data(raw_text)
        cleaned = text_to_lower(cleaned)
        cleaned = remove_special_characters(cleaned)
        cleaned = remove_stopwords(cleaned)
        return stem_words(cleaned)

    # DataFrame.set_value() was deprecated in pandas 0.21 and removed in 1.0;
    # a column-wise apply works on every pandas version and avoids mutating
    # the frame while iterating over it.
    df['text'] = df['text'].apply(_clean_one)
    return df

def create_wordcloud(list_words, name_cloud):
    """Render a word cloud, save it under ``./wordclouds/`` and show it.

    Parameters
    ----------
    list_words
        Passed straight to ``WordCloud.generate``.
        NOTE(review): despite the name this is presumably one text string -- confirm.
    name_cloud : str
        File name appended to ``'./wordclouds/'`` for the saved figure.
    """
    cloud_options = dict(
        stopwords=stopset,
        background_color='black',
        width=1800,
        height=1400,
    )
    cloud = WordCloud(**cloud_options).generate(list_words)
    output_path = './wordclouds/' + name_cloud

    plt.imshow(cloud)
    plt.axis('off')
    # Save before show(): the figure is written out while it is still current.
    plt.savefig(output_path, dpi=300)
    plt.show()

def print_frequency(words, number):
    """Print the ``number`` most frequent items of ``words``.

    Each item is printed on its own line followed by a comma; the counts
    themselves are not printed.
    """
    most_frequent = nltk.FreqDist(words).most_common(number)
    for entry in most_frequent:
        # entry is a (word, count) pair; only the word is shown.
        print('{},'.format(entry[0]))

In [36]:
# # Logistic Regression
# Vectorised input table; the commented-out line is the alternative data file.
#data = pd.read_csv("./data_vectorised/data.csv")
data = pd.read_csv("./data_vectorised/improved.csv")
# Build the response (RESULT) and the design matrix from the listed predictor columns.
y, X = dmatrices("RESULT ~ flu + gett + i + im  + he + school + sleep + dr + catch + make + tomorrow + since + damn + bit + great + keep + h + tired + first + soon + everyone + away + head + thought + someth + ready + next + start + com + fuck + may + little + anyone + lot + body + doct + could + long + god + seem + night + man + care + ok + done + look + stay + weekend + say + eith + nose + isnt + tonight + tell + office + regular + hand + shit + enough + come + rest + mask + help + please + fun + stomach + would + re + yes + sure + stupid + viru + nervou + due + crap + tak + cause + l + hopefully + life + old + wond + yeah + hell + health + woke + disease + clinic + every + must + suck + many + someone + actually + least + headache + kind + nas + concerned + havent + nev + hurt + youre + love + girl + friend + read + hour + hard + b + also + bc + us + anyth + g + ch", data, return_type='dataframe')
# scikit-learn expects a 1-D target, so flatten the single-column frame.
y = np.ravel(y)

# Fit on the complete data set; fit() returns the estimator itself.
# NOTE(review): fit_intercept=False presumably because the patsy design
# matrix already carries its own Intercept column -- confirm.
model = LogisticRegression(fit_intercept=False).fit(X, y)

# Accuracy on the training set (not displayed: it is not the cell's last expression).
model.score(X, y)

# Model evaluation on a held-out validation set (10% of the rows, fixed seed).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=1)
model2 = LogisticRegression(fit_intercept=False)
model2.fit(X_train, y_train)

# predict class labels for the test set
predicted = model2.predict(X_test)
#print predicted
# model2 is already fitted above, so the decision scores are taken directly
# instead of refitting the same estimator on the same data.
y_score = model2.decision_function(X_test)


# generate class probabilities
probs = model2.predict_proba(X_test)
#print probs

# generate evaluation metrics
# print(...) with a single argument behaves identically on Python 2 and 3.
print(metrics.accuracy_score(y_test, predicted))
# Column 1 of predict_proba holds the probability of the positive class.
print(metrics.roc_auc_score(y_test, probs[:, 1]))


print(metrics.confusion_matrix(y_test, predicted))
print(metrics.classification_report(y_test, predicted))

0.770440251572
0.842045195161
[[247  90]
 [ 56 243]]
             precision    recall  f1-score   support

        0.0       0.82      0.73      0.77       337
        1.0       0.73      0.81      0.77       299

avg / total       0.78      0.77      0.77       636



In [33]:
# Display the fitted estimator; its repr lists the hyperparameters in use.
model

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=False,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [35]:
#Persistence Model
# sklearn.externals.joblib is deprecated and has been removed from newer
# scikit-learn releases; prefer the standalone joblib package and fall back
# to the vendored copy on old installations.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib
# Serialise the estimator fitted on the full data set; dump() returns the
# list of files written, which becomes the cell output.
joblib.dump(model, './modelLogistic.pkl')


['./modelLogistic.pkl']

In [8]:
# NOTE: `model` is scored here on X and y, the same names it is fitted on in
# the logistic-regression cell, so these plots show training-set performance.
#ROC Curve
probas = model.predict_proba(X)
skplt.plot_roc_curve(y_true=y, y_probas=probas)
plt.show()
# Confusion Matrix
preds = model.predict(X)
skplt.plot_confusion_matrix(y_true=y, y_pred=preds)
plt.show()
#Learning Curve
#skplt.plot_learning_curve(model, X, y)
#plt.show()

#Precision Recall Curve
skplt.plot_precision_recall_curve(y_true=y, y_probas=probas)
# Show the last figure explicitly, like the two above, so it is rendered
# outside inline backends and the cell does not end on an Axes repr.
plt.show()

In [18]:
# Concat three training datasets
# Change Awareness value 1 to 0

# NOTE: the triple-quoted block below is a bare string expression, so none of
# the statements inside it are executed. It only records how the three
# filtered frames were relabelled, concatenated and de-duplicated on 'text'.
'''
train_awareness.loc[train_awareness.type == 1, 'type'] = 0
frames = [train_notrelated, train_awareness, train_self]
result = pd.concat(frames)
#Drop Duplicate
result = result.drop_duplicates(subset=['text'], keep=False)
# Unique ID
#len(result.text.unique())
header= ['id', 'text', 'type']
'''
#result.to_csv('train.csv',index=False, sep=',',columns= header,encoding='utf-8')
# Load the combined training set from disk and display it as the cell output.
# NOTE(review): presumably ./trainall.json was produced from the disabled
# code above -- confirm, since the only export shown writes 'train.csv'.
train_all= pd.read_json("./trainall.json")
train_all

Unnamed: 0,id,text,type
0,afebbe96529239db53364ca1423fbe0f3b3d88d97892cb...,Central Coast Mariners CEO says having Dyldam ...,0
1,b884772e74bb8ac903e764b676a769761369d3c7f1baea...,Willie Nelson's original Crazy was written a...,0
2,7cfcdc777a9366b9f146c195af51c34eeb9d2c37b2ebaa...,- The Greens &amp; other lefties guilty as s...,0
3,0474458ceffa52e9fe3d4e50deb8613d0b1e7569eada99...,"Hello Erin welcome to Adelaide, I am in awe o...",0
4,5496f87b8033a6b3f4fe54a506bc2d5196fc7aab75de21...,Lake St Clair is beautiful. I would recommend ...,0
5,5ac5a0f617f4e4f422d9031a6a560bfe3a80d2d7097a31...,We certainly need to do something but how fa...,0
6,403d3613dd6608820e9f282e03be3c27d56ed4ba7ece74...,Global deal reached to limit use of hydrofluor...,0
7,4994c2ee4f917a8a204e959267632eaf6d590596136f56...,finish your homework and go to bed 😆,0
8,7e928e18f8309b9f3ba89abaa884c2e2f66672696a6a5e...,When you can't afford life so you just cry,0
9,0abfa5588cc3dd0ab3816995644953f45f621b114d3a75...,"""Cool ridings"" from Sunday morning, I think th...",0


In [3]:
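# NOTE: every line in this cell is commented out. It records the
# clean -> vectorise -> export steps; the last line writes
# ./data_vectorised/improved.csv.
# Clean the text column of the DataFrame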
#train_all = clean_text(train_all)
#Vectorisation
#most_common  = pd.read_csv("./predictors.csv")
#vocabulary = word_tokenize(' '.join(most_common))

#cv_all = sklearn.feature_extraction.text.CountVectorizer(vocabulary=vocabulary)
#list_all = train_all['text'].tolist()

#array_all = cv_all.fit_transform(list_all).toarray()
# Create CSV file
#np.savetxt("./data_vectorised/all_improved.csv", np.asarray(array_all.astype(int)), fmt='%i', delimiter=",")
#foo =  pd.read_csv("./data_vectorised/all_improved.csv")
#foo['RESULT'] = Series(train_all['type'], index=foo.index)
#foo['ID'] = Series(train_all['id'], index=foo.index)
#foo.to_csv('./data_vectorised/improved.csv',sep=',',index=False)