In [23]:
import csv
import pandas as pd
import matplotlib.pyplot as plt
from io import StringIO
import networkx as nx
import re

import numpy as np
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize 
stop_words = set(stopwords.words('english')) 

[nltk_data] Downloading package stopwords to C:\Users\Harrison
[nltk_data]     Lu\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [4]:
df = pd.read_csv('./dataset/sentiment140/data.csv', encoding='latin-1', header = None)
df.columns = ['sentiment', 'ID', 'date', 'query', 'username', 'text']

df = df.applymap(lambda s: s.lower() if type(s) == str else s)
df.head()

Unnamed: 0,sentiment,ID,date,query,username,text
0,0,1467810369,mon apr 06 22:19:45 pdt 2009,no_query,_thespecialone_,"@switchfoot http://twitpic.com/2y1zl - awww, t..."
1,0,1467810672,mon apr 06 22:19:49 pdt 2009,no_query,scotthamilton,is upset that he can't update his facebook by ...
2,0,1467810917,mon apr 06 22:19:53 pdt 2009,no_query,mattycus,@kenichan i dived many times for the ball. man...
3,0,1467811184,mon apr 06 22:19:57 pdt 2009,no_query,ellectf,my whole body feels itchy and like its on fire
4,0,1467811193,mon apr 06 22:19:57 pdt 2009,no_query,karoli,"@nationwideclass no, it's not behaving at all...."


In [8]:
dfTrain = df.sample(frac = 0.8, random_state = 20)
dfTest = df.drop(dfTrain.index)

dfGraphModel = dfTrain.sample(frac = 0.5, random_state = 20)
dfDM = dfTrain.drop(dfGraphModel.index)

#subset of the dfTrain dataset, split into positive, negative, and neutral tweets
pos = dfGraphModel.loc[df['sentiment'] == 4]
neg = dfGraphModel.loc[df['sentiment'] == 0]
neut = dfGraphModel.loc[df['sentiment'] == 2]

posText = {line["ID"]: line["text"] for index, line in pos.iterrows()}
negText = {line["ID"]: line["text"] for index, line in neg.iterrows()}

posWords = [line.rstrip('\n') for line in open('./dataset/positive-words.txt') if line.split() and list(line)[0] != ';']
negWords = [line.rstrip('\n') for line in open('./dataset/negative-words.txt') if line.split() and list(line)[0] != ";"]

In [77]:
subjectTweetsDict = {} # key = index in original df and value = tweet info (sentiment, id, date, query, username, full tweet)
filteredTweets = {} # key = index in original df and value = filtered tweet
count = 0
for row in df.itertuples():
    if 'food' in row[6]:
        subjectTweetsDict[row[0]] = list(row)[1:7]
        word_tokens = str(row[6]).split() #split by white space
        filtered_sentence = [w for w in word_tokens if not w in stop_words] 
        filtered_sentence = [] 
        i = 0
        while i < len(word_tokens):
            if '@' in word_tokens[i]: #Taking out handles from tweets
                i = i + 1
            elif word_tokens[i] not in stop_words:
                filtered_sentence.append(word_tokens[i])
            i = i + 1
        filteredTweets[row[0]] = filtered_sentence
#print(subjectTweetsDict)
#df2 = pd.DataFrame(data=subjectTweetsDict, columns = ['index', 'sentiment', 'id', 'date', 'query', 'username', 'tweet'])
dfSubject= pd.DataFrame.from_dict(subjectTweetsDict, orient = 'index', columns = ['sentiment', 'id', 'date', 'query', 'username', 'tweet'])
for key, value in filteredTweets.items():
    filteredString = ' '.join(value)
    dfSubject.at[key, "tweet"] = filteredString

dfSubject.tail()

Unnamed: 0,sentiment,id,date,query,username,tweet
1598507,4,2193188422,tue jun 16 08:06:59 pdt 2009,no_query,melisarenea,ughh hate wake early drink...aren't supposed s...
1598732,4,2193255029,tue jun 16 08:12:27 pdt 2009,no_query,jessie2point0,needs food iphone?
1599079,4,2193344265,tue jun 16 08:19:50 pdt 2009,no_query,sanityknit,text tell things going! miss guys!! fun - nice...
1599611,4,2193478364,tue jun 16 08:30:46 pdt 2009,no_query,kirstylol,needs food soo much. going watch t.v shower la...
1599831,4,2193551690,tue jun 16 08:36:43 pdt 2009,no_query,exbp_buddhist,"could born emporer penguin. minus 200, carryin..."


In [78]:
dfTrain = dfSubject.sample(frac = 0.8, random_state = 20)
dfTest = df.drop(dfTrain.index)

dfGraphModel = dfTrain.sample(frac = 0.6, random_state = 20)
dfDM = dfTrain.drop(dfGraphModel.index)

posSubject = dfGraphModel.loc[dfSubject['sentiment'] == 4]
negSubject = dfGraphModel.loc[dfSubject['sentiment'] == 0]
neutSubject = dfGraphModel.loc[dfSubject['sentiment'] == 2] #there is no neutral

In [83]:
def createGraph(text, wordGraph, frame):
    if text:
        text = re.sub(r'[^\w\s]', '', str(text))
        text = text.split()
        try:
            for x in range(len(text) - frame):
                for y in range(1, frame + 1):
                    wordGraph.add_edge(text[x], text[x+y])
            for x in reversed(range(1, frame + 1)):
                for y in reversed(range(1, x)):
                    wordGraph.add_edge(text[len(text) - x], text[len(text) - y])
        except IndexError:
            return createGraph(text, wordGraph, frame-1)
        return wordGraph
    else:
        return wordGraph
    
def createGraphFromTweet(text, frame):
    wordGraph = nx.DiGraph()
    if type(text) == str:
        print(text)
        createGraph(text, wordGraph, frame)
    if type(text) == list:
        for element in text:
            createGraph(element, wordGraph, frame)
    return wordGraph

In [89]:
posArrayText = posSubject.as_matrix(columns = posSubject.columns[-1:]).flatten().tolist()
negArrayText = negSubject.as_matrix(columns = negSubject.columns[-1:]).flatten().tolist()
posWordGraph = createGraphFromTweet(posArrayText, 3)
negWordGraph = createGraphFromTweet(negArrayText, 3)

  """Entry point for launching an IPython kernel.
  


In [106]:
#graph similary based on edge similarity

def edgeSimilarity(inputGraph, model):
    count = 0
    normalizationFactor = 1
    edgeList = model.edges()
    if len(inputGraph.edges()) <= len(edgeList):
        normalizationFactor = len(inputGraph.edges())
    else:
        normalizationFactor = len(edgeList)
    for edge in inputGraph.edges():
        if edge in edgeList: count += 1
    return count/normalizationFactor

In [107]:
#try kNN method
from sklearn.neighbors import KNeighborsClassifier as kNN

sentimentTweetDict = dfDM.to_dict(orient='index')
X = []
y = []

model = kNN(n_neighbors = 3)

for key, value in sentimentTweetDict.items():
    X.append(value['sentiment'])
    wordgraph = createGraphFromTweet(value['tweet'], 3)
    y.append(edgeSimilarity(wordGraph, posWordGraph))

everyone talking food ! havent eaten im allowed eat sssssh


ZeroDivisionError: division by zero