In [15]:
import re
import csv
import nltk
import numpy
import sklearn
import pandas as pd
from pandas import Series
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import matplotlib.pyplot as plt
from wordcloud import WordCloud
from nltk.stem import RegexpStemmer
from sklearn.feature_extraction.text import CountVectorizer

# Load the three labelled tweet datasets, one per classification task
train, train2, train3 = map(pd.read_json, (
    "../../data/RelatedVsNotRelated.json",
    "../../data/AwarenessVsInfection.json",
    "../../data/SelfVsOthers.json",
))

# Load the most-common-word tables (one per task) produced by Experiment02;
# their 'Word' column is used later as the vectoriser vocabulary
most_common, most_common2, most_common3 = map(pd.read_csv, (
    "../Experiment02/most_common/related.csv",
    "../Experiment02/most_common/infection2.csv",
    "../Experiment02/most_common/self.csv",
))

In [16]:
# Turn each most-common-word table into a flat token list: the 'Word' column is
# joined into one space-separated string and re-tokenised with NLTK.
# NOTE(review): `self` is an ordinary variable here (the "self vs others" vocabulary),
# not a method receiver; it is read again when the vectorisers are built.
related, infection, self = (
    word_tokenize(' '.join(frame['Word']))
    for frame in (most_common, most_common2, most_common3)
)

## Methods

In [17]:
# Stop-word set: NLTK's English list extended with a few extra tokens
morewords = ['who','which', 'I\'m','\'m']
stopset = set(stopwords.words('english')).union(morewords)

# Remove non-ASCII characters, URLs, RTs, and twitter handles
def clean_data(text):
    """Strip non-ASCII characters and Twitter noise from a raw tweet.

    Parameters
    ----------
    text : str
        Raw tweet text.

    Returns
    -------
    str
        The whitespace-separated tokens of ``text`` joined by single spaces,
        with every character outside the ASCII range removed and with any
        token dropped that contains ``'http'``, starts with ``'@'``, or is
        exactly ``'RT'``.
    """
    # str.replace matches its argument literally, so the character class has to
    # be applied with a regular-expression substitution to have any effect.
    text = re.sub(r'[^\x00-\x7F]', '', text)
    words = [
        token for token in text.split()
        if 'http' not in token and not token.startswith('@') and token != 'RT'
    ]
    return ' '.join(words)

# Lower-case the text
def text_to_lower(text):
    """Return ``text`` converted to lower case."""
    lowered = text.lower()
    return lowered

# Strip a fixed set of punctuation / symbol characters
def remove_special_characters(text):
    """Delete every occurrence of the characters ``#?(){}<>:;.!$%&/=+*^-`` from ``text``.

    All other characters (letters, digits, whitespace, quotes, commas,
    underscores, ...) are returned unchanged.
    """
    unwanted = '#?(){}<>:;.!$%&/=+*^-'
    # The characters are placed inside a regex character class, so each one
    # is matched individually wherever it appears.
    pattern = '[%s]' % unwanted
    return re.sub(pattern, '', text)

# Drop stop words from the text
def remove_stopwords(text):
    """Tokenise ``text`` and remove every token found in the module-level ``stopset``.

    Parameters
    ----------
    text : str
        Text to filter.

    Returns
    -------
    str
        The remaining tokens, in their original order, joined by single spaces.
    """
    # Single pass over the tokens; the filtered list used to be built twice.
    return ' '.join(w for w in word_tokenize(text) if w not in stopset)

# Stem every word by stripping one common English suffix
def stem_words(text):
    """Tokenise ``text`` and stem each token with an NLTK ``RegexpStemmer``.

    The stemmer is configured with an alternation of end-anchored suffixes
    and ``min=4``. Returns the stemmed tokens joined by single spaces.
    """
    # Regex for suffixes
    suffix_pattern = 'ing$|s$|e$|able$|ible$|ful$|less$|ive$|acy$|al$|ance$|ence$|dom$|er$|or$|ism$|ist$|ity$|ty$|ment$|ship$|sion$|tion$|ate$|en$|ify$|fy$|ize$|ise$'
    stemmer = RegexpStemmer(suffix_pattern, min=4)
    return ' '.join(stemmer.stem(token) for token in word_tokenize(text))


def clean_text(df):
    """Run the full cleaning pipeline over the ``'text'`` column of ``df``.

    Each value goes through, in order: ``clean_data``, ``text_to_lower``,
    ``remove_special_characters``, ``remove_stopwords`` and ``stem_words``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``'text'`` column of strings.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object; its ``'text'`` column is overwritten in place,
        so calling this twice on one frame cleans already-cleaned text.
    """
    def _clean_one(raw):
        # Apply the cleaning steps to a single tweet, in pipeline order.
        cleaned = clean_data(raw)
        cleaned = text_to_lower(cleaned)
        cleaned = remove_special_characters(cleaned)
        cleaned = remove_stopwords(cleaned)
        return stem_words(cleaned)

    # Column-wise assignment replaces the per-row DataFrame.set_value call,
    # which was deprecated in pandas 0.21 and removed in pandas 1.0.
    df['text'] = df['text'].apply(_clean_one)
    return df

def create_cvs(text, name_file, number):
    """Write the ``number`` most frequent items of ``text`` to a CSV file.

    The file is ``./most_common/<name_file>.csv``; it holds a
    ``Word,Frequency`` header row followed by one row per item, as returned
    by ``nltk.FreqDist(text).most_common(number)``.
    """
    out_path = './most_common/'+name_file+'.csv'
    with open(out_path, 'w') as handle:
        writer = csv.writer(handle, delimiter=',')
        writer.writerow(['Word', 'Frequency'])
        # Calculate frequency distribution and emit its top entries
        writer.writerows(nltk.FreqDist(text).most_common(number))

In [18]:
# Clean the 'text' column of each dataset.
# clean_text overwrites the column on the frame it is given, so re-running this
# cell without reloading the data cleans already-cleaned text.
train, train2, train3 = map(clean_text, (train, train2, train3))

In [19]:
# One CountVectorizer per task, each restricted to that task's most-common-word
# vocabulary, plus the cleaned documents it will be applied to.
cv_related = CountVectorizer(vocabulary=related)
cv_infection = CountVectorizer(vocabulary=infection)
cv_self = CountVectorizer(vocabulary=self)

list_related = train['text'].tolist()
list_infection = train2['text'].tolist()
list_self = train3['text'].tolist()

In [20]:
# Vectorise each cleaned corpus against its fixed vocabulary and persist the
# count matrices as headerless integer CSVs (one row per document).
# All three files are written because the labelling cells further down read
# related.csv, infection2.csv and self.csv; with two of the writes commented
# out, a fresh Restart & Run All would fail or pick up stale files.
array_related = cv_related.fit_transform(list_related).toarray()
numpy.savetxt("related.csv", array_related, fmt='%i', delimiter=",")

array_infection = cv_infection.fit_transform(list_infection).toarray()
numpy.savetxt("infection2.csv", array_infection, fmt='%i', delimiter=",")

array_self = cv_self.fit_transform(list_self).toarray()
numpy.savetxt("self.csv", array_self, fmt='%i', delimiter=",")



In [14]:
# Display the token list used as the vocabulary of the infection CountVectorizer
infection

['sick',
 'feel',
 'got',
 'lik',
 'go',
 'im',
 'get',
 'still',
 'hom',
 'good',
 'cold',
 'work',
 'today',
 'cough',
 'bad',
 'sore',
 'nose',
 'runny',
 'manflu',
 'flu']

In [102]:
# Attach the label and id of every document to the "related" count vectors.
# related.csv is expected to be the headerless matrix written by numpy.savetxt,
# so header=None is required: by default read_csv would consume the first
# document's vector as column names, losing one row and shifting every label.
foo = pd.read_csv("related.csv", header=None)
# Rows are in the same order as train['text'], so assign positionally; a length
# mismatch (e.g. a stale CSV) raises instead of silently producing NaNs.
foo['RESULT'] = train['type'].values
foo['ID'] = train['id'].values
foo.to_csv('./data_vectorised/related.csv',sep=',')

In [21]:
# Attach the label and id of every document to the "infection" count vectors.
# infection2.csv is the headerless matrix written by numpy.savetxt, so
# header=None is required: by default read_csv would consume the first
# document's vector as column names, losing one row and shifting every label.
foo = pd.read_csv("infection2.csv", header=None)
# The infection vectors were built from train2['text'], so labels and ids must
# come from train2 as well (they were previously taken from train).
# NOTE(review): assumes train2 has 'type' and 'id' columns like train - confirm.
foo['RESULT'] = train2['type'].values
foo['ID'] = train2['id'].values
foo.to_csv('./data_vectorised/infection2.csv',sep=',')

In [109]:
# Attach the label and id of every document to the "self" count vectors.
# self.csv is expected to be the headerless matrix written by numpy.savetxt,
# so header=None is required: by default read_csv would consume the first
# document's vector as column names, losing one row and shifting every label.
foo = pd.read_csv("self.csv", header=None)
# The self vectors were built from train3['text'], so labels and ids must come
# from train3 as well (they were previously taken from train).
# NOTE(review): assumes train3 has 'type' and 'id' columns like train - confirm.
foo['RESULT'] = train3['type'].values
foo['ID'] = train3['id'].values
foo.to_csv('./data_vectorised/self.csv',sep=',')