In [101]:
import re
import nltk
import numpy as np
import sklearn
import pandas as pd
from patsy import dmatrices
from scikitplot import plotters as skplt
import matplotlib.pyplot as plt
from pandas import Series
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from wordcloud import WordCloud
from nltk.stem import RegexpStemmer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.cross_validation import train_test_split
from sklearn.preprocessing import label_binarize
from sklearn import metrics
from sklearn.metrics import roc_curve, auc, classification_report

# Disabled loader, kept as a bare string literal so that none of it executes.
# The code inside read every line of the tweet dump, stripped trailing
# whitespace, joined the lines with commas inside "[...]" to form one JSON
# array string, and parsed that string with pd.read_json into `sample`.
'''
# read the entire file into a python array
with open('../../data/tweetDB-AU-15-Oct.json', 'rb') as f:
    data = f.readlines()
# remove the trailing "\n" from each line
#with open('./random_sample.json', 'rb') as f:
#    data = f.readlines()
data = map(lambda x: x.rstrip(), data)
data_json_str = "[" + ','.join(data) + "]"
# now, load it into pandas
sample = pd.read_json(data_json_str)
'''

# Labelled training sets, one JSON file per classification task.
train3 = pd.read_json("../../data/RelatedVsNotRelated.json")
train2 = pd.read_json("../../data/AwarenessVsInfection.json")
train = pd.read_json("../../data/SelfVsOthers.json")

# ## Awareness Vs Infection
# 0: Influenza infection
# 1: Influenza awareness
# .copy() gives every subset its own data. The 'type' labels of these frames
# are rewritten in place further down the notebook, and assigning into a bare
# .loc selection of another frame triggers pandas' SettingWithCopyWarning
# (the write may land on a temporary instead of the frame you think).
train_infection = train2.loc[train2['type'] == 0].copy()
train_awareness = train2.loc[train2['type'] == 1].copy()

# ## Self Vs Others
# 0: Others (the tweet describes someone else)
# 1: Self (the tweet describes the author)
train_others = train.loc[train['type'] == 0].copy()
train_self = train.loc[train['type'] == 1].copy()

# Define Word Stops
# Start from NLTK's English stopword list and add corpus-specific terms.
stopset = set(stopwords.words('english'))
morewords = ["'s", "swine", "bird", "h1n1", "'ve", "lol", "pig"]
stopset.update(morewords)
# Remove word from stopword list
# NOTE(review): these pronouns/auxiliaries are presumably kept in the text
# because they help separate "self" from "others" tweets -- confirm.
itemsToRemove = ['can','am', 'are', 're', 'm','have','has','i', 'you', 'he', 'she', 'we', 'they']
# Subtract in place so stopset stays a set: membership tests in
# remove_stopwords stay O(1), and a set of strings is what WordCloud's
# `stopwords` argument is documented to take. (Rebuilding it with a list
# comprehension silently turned it into a list.)
stopset.difference_update(itemsToRemove)

#Methods
# Remove URLs, RTs, and twitter handles
def clean_data(text):
    """Strip non-ASCII characters, URLs, @handles and the 'RT' marker from a tweet.

    Parameters
    ----------
    text : str or bytes
        Raw tweet text. UTF-8 encoded bytes are decoded first; text that is
        already decoded is used as is.

    Returns
    -------
    str
        The remaining whitespace-separated tokens joined by single spaces.
    """
    # Only byte strings need decoding; calling .decode on already-decoded
    # text fails (implicit ASCII encode on Python 2, no such method on 3).
    if isinstance(text, bytes):
        text = text.decode('utf-8')
    # str.replace treats its argument as a literal substring, so the pattern
    # was never applied; a real regex substitution is required here.
    text = re.sub(r'[^\x00-\x7F]', '', text)
    # Drop any token containing 'http', any @handle, and the bare 'RT' marker.
    words = [token for token in text.split()
             if 'http' not in token and not token.startswith('@') and token != 'RT']
    return ' '.join(words)

# Text to Lower Case
def text_to_lower(text):
    """Return ``text`` converted to lower case via ``str.lower``."""
    lowered = text.lower()
    return lowered

# Remove some characters
def remove_special_characters(text):
    """Delete punctuation characters and digits from ``text``.

    Every character listed in ``bad_chars`` is removed; all other characters
    (including whitespace and underscores) are left untouched.

    Parameters
    ----------
    text : str
        Text to filter.

    Returns
    -------
    str
        ``text`` without any of the characters in ``bad_chars``.
    """
    bad_chars = '-#?(){}<>:;.!$%&/=+*^-`\'0123456789'
    # Escape before embedding in a character class: unescaped, the sequence
    # "^-`" is read as the range U+005E..U+0060, which also deletes "_"
    # even though it is not in the list.
    rgx = re.compile('[%s]' % re.escape(bad_chars))
    return rgx.sub('', text)

# Drop stopwords from a text
def remove_stopwords(text):
    """Tokenize ``text`` and drop every token found in the module-level ``stopset``.

    Parameters
    ----------
    text : str
        Text to filter; it is split with NLTK's ``word_tokenize``.

    Returns
    -------
    str
        The surviving tokens joined by single spaces.
    """
    word_tokens = word_tokenize(text)
    # One pass only: the list used to be built by a comprehension, thrown
    # away, and rebuilt by an equivalent loop.
    filtered_sentence = [w for w in word_tokens if w not in stopset]
    return ' '.join(filtered_sentence)

# Stemming words
def stem_words(text):
    """Tokenize ``text`` and stem each token with a suffix-stripping RegexpStemmer.

    Parameters
    ----------
    text : str
        Text to stem; it is split with NLTK's ``word_tokenize``.

    Returns
    -------
    str
        The stemmed tokens joined by single spaces.
    """
    tokens = word_tokenize(text)
    # Regex for suffixes; min=4 is handed to RegexpStemmer (per the NLTK docs,
    # the minimum length a string must have before it is stemmed).
    suffix_stemmer = RegexpStemmer('ing$|s$|able$|ible$|ful$|less$|ive$|acy$|al$|ance$|ence$|dom$|er$|or$|ism$|ist$|ity$|ty$|ment$|ship$|sion$|tion$|ate$|en$|ify$|fy$|ize$|ise$', min=4)
    return ' '.join([suffix_stemmer.stem(token) for token in tokens])


def clean_text(df):
    """Run the full cleaning pipeline over the 'text' column of ``df``.

    Each value goes through clean_data -> text_to_lower ->
    remove_special_characters -> remove_stopwords -> stem_words.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a 'text' column. The column is overwritten in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, returned for convenience.
    """
    def _clean_one(raw_text):
        # Apply the cleaning steps, in pipeline order, to one text value.
        cleaned_text = clean_data(raw_text)
        cleaned_text = text_to_lower(cleaned_text)
        cleaned_text = remove_special_characters(cleaned_text)
        cleaned_text = remove_stopwords(cleaned_text)
        cleaned_text = stem_words(cleaned_text)
        return cleaned_text

    # Column-wise apply replaces iterrows + DataFrame.set_value: set_value was
    # deprecated and later removed from pandas, and writing row by row through
    # index labels is ambiguous when the index contains duplicates
    # (e.g. after pd.concat without ignore_index).
    df['text'] = df['text'].apply(_clean_one)
    return df

def create_wordcloud(list_words, name_cloud):
    """Build a word cloud, save it under ./wordclouds/ and display it.

    Parameters
    ----------
    list_words
        Passed straight to ``WordCloud.generate``.
        NOTE(review): despite the name this is presumably one text string,
        not a list -- confirm against callers.
    name_cloud : str
        File name appended to './wordclouds/' for the saved figure.
    """
    cloud_options = dict(
        stopwords=stopset,
        background_color='black',
        width=1800,
        height=1400,
    )
    rendered_cloud = WordCloud(**cloud_options).generate(list_words)
    plt.imshow(rendered_cloud)
    plt.axis('off')
    # Save before show(), as in the original statement order.
    plt.savefig('./wordclouds/' + name_cloud, dpi=300)
    plt.show()

def print_frequency(words, number):
    """Print the ``number`` most common items of ``words``, one per line.

    Each line is the item followed by a comma; the counts themselves are
    not printed.
    """
    # Calculate frequency distribution
    distribution = nltk.FreqDist(words)
    top_words = [word for word, _count in distribution.most_common(number)]
    for word in top_words:
        print('{},'.format(word))

In [102]:
# Disabled one-off sampling step: everything below is either a comment or a
# bare string literal, so nothing here runs (the cell only echoes the string).
# The code in the string drew 2000 rows from `sample` with random_state=1, kept
# the id_tweet and text columns, added a constant 'type' column of 0, renamed
# id_tweet to id, and wrote the records to ./random_sample.json.
#len(data_df)
'''
sample = sample.sample(n=2000,random_state=1)
data_df= sample[['id_tweet', 'text']]
data_df['type'] = pd.Series(0, index=data_df.index)
data_df= data_df.rename(columns={'id_tweet':'id'})
data_df
out = data_df.to_json(orient='records')
with open('./random_sample.json', 'w') as f:
    f.write(out)
'''
#data_df.to_csv('./random_sample.csv',header ='id, text, RESULT', index=False, encoding='utf-8')

"\nsample = sample.sample(n=2000,random_state=1)\ndata_df= sample[['id_tweet', 'text']]\ndata_df['type'] = pd.Series(0, index=data_df.index)\ndata_df= data_df.rename(columns={'id_tweet':'id'})\ndata_df\nout = data_df.to_json(orient='records')\nwith open('./random_sample.json', 'w') as f:\n    f.write(out)\n"

In [103]:
# Load the previously exported random sample and preview its first rows.
sample_path = "./random_sample.json"
random_sample = pd.read_json(sample_path)
random_sample.head(5)

Unnamed: 0,id,text,type
0,afebbe96529239db53364ca1423fbe0f3b3d88d97892cb...,Central Coast Mariners CEO says having Dyldam ...,0
1,b884772e74bb8ac903e764b676a769761369d3c7f1baea...,Willie Nelson's original Crazy was written a...,0
2,7cfcdc777a9366b9f146c195af51c34eeb9d2c37b2ebaa...,- The Greens &amp; other lefties guilty as s...,0
3,0474458ceffa52e9fe3d4e50deb8613d0b1e7569eada99...,"Hello Erin welcome to Adelaide, I am in awe o...",0
4,5496f87b8033a6b3f4fe54a506bc2d5196fc7aab75de21...,Lake St Clair is beautiful. I would recommend ...,0


In [107]:
# Re-label the two subsets for the combined binary task:
# awareness rows (type 1) become 0, infection rows (type 0) become 1.
# Take explicit copies first: both frames come from a .loc selection of
# another frame, and writing into such a selection raises
# SettingWithCopyWarning (the assignment may hit a temporary).
train_awareness = train_awareness.copy()
train_infection = train_infection.copy()
train_awareness.loc[train_awareness.type == 1, 'type'] = 0
train_infection.loc[train_infection.type == 0, 'type'] = 1

# (label, frame) pairs in report order; the label text is printed verbatim.
labelled_frames = [
    ('(0)train not related(random sample):', random_sample),
    ('(0)train awareness:', train_awareness),
    ('(1)train infection:', train_infection),
    ('(1)train self:', train_self),
]

# Single-argument print(...) produces the same output on Python 2 and 3,
# unlike the print statement, which is a syntax error on Python 3.
for label, frame in labelled_frames:
    print('{} {}'.format(label, len(frame)))
print('------------------------------------')
for label, frame in labelled_frames:
    print('{} {}'.format(label, pd.value_counts(frame['type'].values, sort=False)))

(0)train not related(random sample): 2000
(0)train awareness: 1417
(1)train infection: 1454
(1)train self: 1482
------------------------------------
(0)train not related(random sample): 0    2000
dtype: int64
(0)train awareness: 0    1417
dtype: int64
(1)train infection: 1    1454
dtype: int64
(1)train self: 1    1482
dtype: int64


In [108]:
# Concat 4 training datasets
# (awareness rows were already re-labelled from 1 to 0 before this cell)
# Single-argument print(...) gives identical output on Python 2 and 3.
print('(0) not related(sample) + awareness: {}'.format(len(random_sample)+len(train_awareness)))
print('(1) infection + self: {}'.format(len(train_infection)+len(train_self)))

frames = [random_sample,train_awareness,train_infection, train_self]
result = pd.concat(frames)
#Drop Duplicate
#result = result.drop_duplicates(subset=['id'], keep=False)
# Unique ID
#len(result.text.unique())
# Round-trip through a records-oriented JSON file: this persists the combined
# set and, because 'records' does not store the index, the frame read back
# gets a fresh index instead of the repeated labels left by concat.
out = result.to_json(orient='records')
with open('./trainall.json', 'w') as f:
    f.write(out)

dataset = pd.read_json('./trainall.json')
# Preview only; never dump the whole frame into the notebook output.
dataset.head(10)
#result.to_csv('train.csv',index=False, sep=',',columns= header,encoding='utf-8')
#train_all= pd.read_csv("./trainall.csv")
#train_all.head()

(0) not related(sample) + awareness: 3417
(1) infection + self: 2936


Unnamed: 0,id,text,type
0,afebbe96529239db53364ca1423fbe0f3b3d88d97892cb...,Central Coast Mariners CEO says having Dyldam ...,0
1,b884772e74bb8ac903e764b676a769761369d3c7f1baea...,Willie Nelson's original Crazy was written a...,0
2,7cfcdc777a9366b9f146c195af51c34eeb9d2c37b2ebaa...,- The Greens &amp; other lefties guilty as s...,0
3,0474458ceffa52e9fe3d4e50deb8613d0b1e7569eada99...,"Hello Erin welcome to Adelaide, I am in awe o...",0
4,5496f87b8033a6b3f4fe54a506bc2d5196fc7aab75de21...,Lake St Clair is beautiful. I would recommend ...,0
5,5ac5a0f617f4e4f422d9031a6a560bfe3a80d2d7097a31...,We certainly need to do something but how fa...,0
6,403d3613dd6608820e9f282e03be3c27d56ed4ba7ece74...,Global deal reached to limit use of hydrofluor...,0
7,4994c2ee4f917a8a204e959267632eaf6d590596136f56...,finish your homework and go to bed 😆,0
8,7e928e18f8309b9f3ba89abaa884c2e2f66672696a6a5e...,When you can't afford life so you just cry,0
9,0abfa5588cc3dd0ab3816995644953f45f621b114d3a75...,"""Cool ridings"" from Sunday morning, I think th...",0


In [111]:
# Class balance and total row count of the combined training set.
# print(...) with one argument behaves the same on Python 2 and 3.
print(pd.value_counts(dataset['type'].values, sort=False))
print(len(dataset))

0    3417
1    2936
dtype: int64
6353


In [112]:
#dataset.loc[dataset['type'] == 4786269900]
