In [23]:
import csv
import pandas as pd
import matplotlib.pyplot as plt
from io import StringIO
import networkx as nx
import re

import numpy as np
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize 
stop_words = set(stopwords.words('english')) 

[nltk_data] Downloading package stopwords to C:\Users\Harrison
[nltk_data]     Lu\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [4]:
df = pd.read_csv('./dataset/sentiment140/data.csv', encoding='latin-1', header = None)
df.columns = ['sentiment', 'ID', 'date', 'query', 'username', 'text']

df = df.applymap(lambda s: s.lower() if type(s) == str else s)
df.head()

Unnamed: 0,sentiment,ID,date,query,username,text
0,0,1467810369,mon apr 06 22:19:45 pdt 2009,no_query,_thespecialone_,"@switchfoot http://twitpic.com/2y1zl - awww, t..."
1,0,1467810672,mon apr 06 22:19:49 pdt 2009,no_query,scotthamilton,is upset that he can't update his facebook by ...
2,0,1467810917,mon apr 06 22:19:53 pdt 2009,no_query,mattycus,@kenichan i dived many times for the ball. man...
3,0,1467811184,mon apr 06 22:19:57 pdt 2009,no_query,ellectf,my whole body feels itchy and like its on fire
4,0,1467811193,mon apr 06 22:19:57 pdt 2009,no_query,karoli,"@nationwideclass no, it's not behaving at all...."


In [8]:
dfTrain = df.sample(frac = 0.8, random_state = 20)
dfTest = df.drop(dfTrain.index)

dfGraphModel = dfTrain.sample(frac = 0.5, random_state = 20)
dfDM = dfTrain.drop(dfGraphModel.index)

#subset of the dfTrain dataset, split into positive, negative, and neutral tweets
pos = dfGraphModel.loc[df['sentiment'] == 4]
neg = dfGraphModel.loc[df['sentiment'] == 0]
neut = dfGraphModel.loc[df['sentiment'] == 2]

posText = {line["ID"]: line["text"] for index, line in pos.iterrows()}
negText = {line["ID"]: line["text"] for index, line in neg.iterrows()}

posWords = [line.rstrip('\n') for line in open('./dataset/positive-words.txt') if line.split() and list(line)[0] != ';']
negWords = [line.rstrip('\n') for line in open('./dataset/negative-words.txt') if line.split() and list(line)[0] != ";"]

In [77]:
subjectTweetsDict = {} # key = index in original df and value = tweet info (sentiment, id, date, query, username, full tweet)
filteredTweets = {} # key = index in original df and value = filtered tweet
count = 0
for row in df.itertuples():
    if 'food' in row[6]:
        subjectTweetsDict[row[0]] = list(row)[1:7]
        word_tokens = str(row[6]).split() #split by white space
        filtered_sentence = [w for w in word_tokens if not w in stop_words] 
        filtered_sentence = [] 
        i = 0
        while i < len(word_tokens):
            if '@' in word_tokens[i]: #Taking out handles from tweets
                i = i + 1
            elif word_tokens[i] not in stop_words:
                filtered_sentence.append(word_tokens[i])
            i = i + 1
        filteredTweets[row[0]] = filtered_sentence
#print(subjectTweetsDict)
#df2 = pd.DataFrame(data=subjectTweetsDict, columns = ['index', 'sentiment', 'id', 'date', 'query', 'username', 'tweet'])
dfSubject= pd.DataFrame.from_dict(subjectTweetsDict, orient = 'index', columns = ['sentiment', 'id', 'date', 'query', 'username', 'tweet'])
for key, value in filteredTweets.items():
    filteredString = ' '.join(value)
    dfSubject.at[key, "tweet"] = filteredString

dfSubject.tail()

Unnamed: 0,sentiment,id,date,query,username,tweet
1598507,4,2193188422,tue jun 16 08:06:59 pdt 2009,no_query,melisarenea,ughh hate wake early drink...aren't supposed s...
1598732,4,2193255029,tue jun 16 08:12:27 pdt 2009,no_query,jessie2point0,needs food iphone?
1599079,4,2193344265,tue jun 16 08:19:50 pdt 2009,no_query,sanityknit,text tell things going! miss guys!! fun - nice...
1599611,4,2193478364,tue jun 16 08:30:46 pdt 2009,no_query,kirstylol,needs food soo much. going watch t.v shower la...
1599831,4,2193551690,tue jun 16 08:36:43 pdt 2009,no_query,exbp_buddhist,"could born emporer penguin. minus 200, carryin..."


In [155]:
dfTrain = dfSubject.sample(frac = 0.8, random_state = 20)
dfTest = dfSubject.drop(dfTrain.index)

dfGraphModel = dfTrain.sample(frac = 0.6, random_state = 20)
dfDM = dfTrain.drop(dfGraphModel.index)

posSubject = dfGraphModel.loc[dfSubject['sentiment'] == 4]
negSubject = dfGraphModel.loc[dfSubject['sentiment'] == 0]
neutSubject = dfGraphModel.loc[dfSubject['sentiment'] == 2] #there is no neutral

In [113]:
def createGraph(text, wordGraph, frame):
    if text:
        text = re.sub(r'[^\w\s]', '', str(text))
        text = text.split()
        try:
            for x in range(len(text) - frame):
                for y in range(1, frame + 1):
                    wordGraph.add_edge(text[x], text[x+y])
            for x in reversed(range(1, frame + 1)):
                for y in reversed(range(1, x)):
                    wordGraph.add_edge(text[len(text) - x], text[len(text) - y])
        except IndexError:
            return createGraph(text, wordGraph, frame-1)
        return wordGraph
    else:
        return wordGraph
    
def createGraphFromTweet(text, frame):
    wordGraph = nx.DiGraph()
    if type(text) == str:
        createGraph(text, wordGraph, frame)
    if type(text) == list:
        for element in text:
            createGraph(element, wordGraph, frame)
    return wordGraph

In [391]:
posArrayText = posSubject.as_matrix(columns = posSubject.columns[-1:]).flatten().tolist()
negArrayText = negSubject.as_matrix(columns = negSubject.columns[-1:]).flatten().tolist()
posWordGraph = createGraphFromTweet(posArrayText, 4)
negWordGraph = createGraphFromTweet(negArrayText, 4)

  """Entry point for launching an IPython kernel.
  


In [257]:
#graph similary functions

def edgeSimilarity(inputGraph, model):
    count = 0
    for edge in inputGraph.edges():
        n1, n2 = edge
        if model.has_edge(n1, n2): 
            count += 1
    return count/min(len(inputGraph), len(model))

def getMCS(graphModel, tweetGraph):
    matching_graph=nx.Graph() #subgraph

    for n1,n2,attr in tweetGraph.edges(data=True):
        if graphModel.has_edge(n1,n2) :
            matching_graph.add_edge(n1,n2,weight=1)

    graphs = list(nx.connected_component_subgraphs(matching_graph))

    mcs_length = 0
    mcs_graph = nx.DiGraph() 
    
    for i, graph in enumerate(graphs):                        #Finding maximum subgraph out of all graphs

        if len(graph.nodes()) > mcs_length: 
            mcs_length = len(graph.nodes())
            mcs_graph = graph

    return mcs_graph

def MCSNS(mcs_graph, graphModel, tweetGraph): #number of nodes in common subgraph divided by minimum number of nodes
    return len(mcs_graph)/min(len(graphModel),len(tweetGraph))

def MCSUES(mcs_graph, graphModel, tweetGraph): #number of edges in MCS divided by min number of nodes
    return len(mcs_graph.edges())/min(len(graphModel),len(tweetGraph))

def MCSDES(mcs_graph, graphModel, tweetGraph): #edges in the mcs_graph are the same direction in both graphs
    count = 0
    for e1,e2 in mcs_graph.edges():
        if tweetGraph.has_edge(e1,e2) and graphModel.has_edge(e1,e2):
            count+=1
    return count/min(len(graphModel),len(tweetGraph))

In [259]:
sentimentTweetDict = dfDM.to_dict(orient='index')

def generateWordGraphVectors(dataframeDict, posWordGraph, negWordGraph, metricType):
    X = []
    y = []
    count = 0
    for key, value in dataframeDict.items():
        if value['tweet']:
            y.append(value['sentiment'])
            wordGraph = createGraphFromTweet(value['tweet'], 4)
            posNegArray = []
            if metricType == "edge":
                posNegArray.append(edgeSimilarity(wordGraph, posWordGraph))
                posNegArray.append(edgeSimilarity(wordGraph, negWordGraph))
            elif metricType == "MCSNS":
                mcs_graph = getMCS(posWordGraph, wordGraph)
                posNegArray.append(MCSNS(mcs_graph, wordGraph, posWordGraph))
                mcs_graph = getMCS(negWordGraph, wordGraph)
                posNegArray.append(MCSNS(mcs_graph, wordGraph, negWordGraph))
            elif metricType == "MCSUES":
                mcs_graph = getMCS(posWordGraph, wordGraph)
                posNegArray.append(MCSUES(mcs_graph, wordGraph, posWordGraph))
                mcs_graph = getMCS(negWordGraph, wordGraph)
                posNegArray.append(MCSUES(mcs_graph, wordGraph, negWordGraph))
            elif metricType == "MCSDES":
                mcs_graph = getMCS(posWordGraph, wordGraph)
                posNegArray.append(MCSDES(mcs_graph, wordGraph, posWordGraph))
                mcs_graph = getMCS(negWordGraph, wordGraph)
                posNegArray.append(MCSDES(mcs_graph, wordGraph, negWordGraph))
            X.append(posNegArray)
            count += 1
    return X, y

X, y = generateWordGraphVectors(sentimentTweetDict, posWordGraph, negWordGraph, "edge")
testDict = dfTest.to_dict(orient='index')
Xtest, ytest = generateWordGraphVectors(testDict, posWordGraph, negWordGraph, "edge")

In [261]:
#try kNN method
from sklearn.neighbors import KNeighborsClassifier as kNN


model = kNN(n_neighbors = 5)
model.fit(X, y)
model.score(Xtest, ytest)

0.582345971563981

In [262]:
#svm
from sklearn import svm

classifier = svm.SVC(kernel = "linear")
classifier.fit(X, y)
classifier.score(Xtest, ytest)

0.620260663507109

In [263]:
#linear regression
from sklearn.linear_model import LogisticRegression as logreg

classifier = logreg()
classifier.fit(X, y)
classifier.score(Xtest, ytest)

0.625

In [264]:
#decision tree

from sklearn.tree import DecisionTreeClassifier as dectree

classifier = dectree()
classifier.fit(X, y)
classifier.score(Xtest, ytest)

0.5598341232227488

In [404]:
X, y = generateWordGraphVectors(sentimentTweetDict, posWordGraph, negWordGraph, "MCSNS")
testDict = dfTest.to_dict(orient='index')
Xtest, ytest = generateWordGraphVectors(testDict, posWordGraph, negWordGraph, "MCSNS")

model = kNN(n_neighbors = 5)
model.fit(X, y)
print("kNN score: " + str(model.score(Xtest, ytest)))

classifier = svm.SVC(kernel = "linear")
classifier.fit(X, y)
print("SVM score: " + str(classifier.score(Xtest, ytest)))

classifier = logreg()
classifier.fit(X, y)
print("Logistic regression score: " + str(classifier.score(Xtest, ytest)))

classifier = dectree()
classifier.fit(X, y)
print("Decision tree score: " + str(classifier.score(Xtest, ytest)))

kNN score: 0.5438388625592417
SVM score: 0.580568720379147
Logistic regression score: 0.5817535545023697
Decision tree score: 0.5545023696682464


In [406]:
X, y = generateWordGraphVectors(sentimentTweetDict, posWordGraph, negWordGraph, "MCSUES")
testDict = dfTest.to_dict(orient='index')
Xtest, ytest = generateWordGraphVectors(testDict, posWordGraph, negWordGraph, "MCSUES")

model = kNN(n_neighbors = 5)
model.fit(X, y)
print("kNN score: " + str(model.score(Xtest, ytest)))

classifier = svm.SVC(kernel = "linear")
classifier.fit(X, y)
print("SVM score: " + str(classifier.score(Xtest, ytest)))

classifier = logreg()
classifier.fit(X, y)
print("Logistic regression score: " + str(classifier.score(Xtest, ytest)))

classifier = dectree()
classifier.fit(X, y)
print("Decision tree score: " + str(classifier.score(Xtest, ytest)))

kNN score: 0.5989336492890995
SVM score: 0.6149289099526066
Logistic regression score: 0.6137440758293838
Decision tree score: 0.5610189573459715


In [267]:
X, y = generateWordGraphVectors(sentimentTweetDict, posWordGraph, negWordGraph, "MCSDES")
testDict = dfTest.to_dict(orient='index')
Xtest, ytest = generateWordGraphVectors(testDict, posWordGraph, negWordGraph, "MCSDES")

model = kNN(n_neighbors = 5)
model.fit(X, y)
print("kNN score: " + str(model.score(Xtest, ytest)))

classifier = svm.SVC(kernel = "linear")
classifier.fit(X, y)
print("SVM score: " + str(classifier.score(Xtest, ytest)))

classifier = logreg()
classifier.fit(X, y)
print("Logistic regression score: " + str(classifier.score(Xtest, ytest)))

classifier = dectree()
classifier.fit(X, y)
print("Decision tree score: " + str(classifier.score(Xtest, ytest)))

kNN score: 0.5361374407582938
SVM score: 0.5669431279620853
Logistic regression score: 0.5728672985781991
Decision tree score: 0.5219194312796208


In [422]:
#trimming graphs based on a hard limit- how many times a word was used. basically, edge weight of 1 or 0

def createTrimmedGraph(text, wordGraph, frame):
    if text:
        text = re.sub(r'[^\w\s]', '', str(text))
        text = text.split()
        try:
            if len(text) == 1:
                wordGraph.add_edge(text[0], text[0], weight = 1)
            for x in range(len(text) - frame):
                for y in range(1, frame + 1):
                    n1 = text[x]
                    n2 = text[x+y]
                    if wordGraph.has_edge(n1, n2):
                        wordGraph[n1][n2]['weight'] = wordGraph[n1][n2]['weight'] + 1
                    else:
                        wordGraph.add_edge(n1, n2, weight = 1)
            for x in reversed(range(1, frame + 1)):
                for y in reversed(range(1, x)):
                    n1 = text[len(text) - x]
                    n2 = text[len(text) - y]
                    if wordGraph.has_edge(n1, n2):
                        wordGraph[n1][n2]['weight'] = wordGraph[n1][n2]['weight'] + 1
                    else:
                        wordGraph.add_edge(n1, n2, weight = 1)
        except IndexError:
            return createGraph(text, wordGraph, frame-1)
        return wordGraph
    else:
        return wordGraph
    
def createGraphWithWeightsFromTweet(text, frame):
    wordGraph = nx.DiGraph()
    if type(text) == str:
        createTrimmedGraph(text, wordGraph, frame)
    if type(text) == list:
        for element in text:
            createTrimmedGraph(element, wordGraph, frame)
    return wordGraph

In [455]:
posWordGraph = createGraphWithWeightsFromTweet(posArrayText, 4)
negWordGraph = createGraphWithWeightsFromTweet(negArrayText, 4)

In [463]:
def removeEdgesByWeight(graph, threshold):
    returnGraph = graph.copy()
    edgeCountDict = nx.get_edge_attributes(returnGraph, 'weight')
    for key, value in edgeCountDict.items():
        if value <= threshold:
            returnGraph.remove_edge(*key)
    emptyNodes = list(nx.isolates(returnGraph))
    returnGraph.remove_nodes_from(emptyNodes)
    return returnGraph

In [440]:
nx.get_edge_attributes(posWordGraph, 'weight')

{('ive', 'scones'): 1,
 ('ive', 'toothey'): 1,
 ('ive', 'help'): 1,
 ('ive', 'lose'): 1,
 ('ive', 'gotta'): 1,
 ('ive', 'tell'): 1,
 ('ive', 'something'): 1,
 ('ive', 'got'): 1,
 ('ive', 'headache'): 1,
 ('ive', 'might'): 1,
 ('ive', 'go'): 1,
 ('ive', 'waitin'): 1,
 ('ive', '4'): 1,
 ('ive', '4ever'): 1,
 ('ive', 'started'): 1,
 ('ive', 'cooking'): 1,
 ('ive', 'family'): 1,
 ('ive', 'ministry'): 1,
 ('ive', 'tried'): 3,
 ('ive', 'foriegn'): 1,
 ('ive', 'food'): 5,
 ('ive', 'craving'): 2,
 ('ive', 'sweet'): 1,
 ('ive', 'potatoes'): 1,
 ('ive', 'almonds'): 1,
 ('ive', 'still'): 1,
 ('ive', 'sleep'): 1,
 ('ive', '40'): 1,
 ('ive', 'broken'): 1,
 ('ive', 'found'): 1,
 ('ive', 'seat'): 1,
 ('ive', 'bbq'): 1,
 ('ive', 'right'): 1,
 ('ive', 'decided'): 1,
 ('ive', 'gone'): 1,
 ('ive', '2'): 1,
 ('ive', 'culinary'): 1,
 ('ive', 'chopped'): 1,
 ('ive', 'loooooads'): 1,
 ('ive', 'greens'): 1,
 ('ive', 'figure'): 1,
 ('ive', 'never'): 3,
 ('ive', 'well'): 1,
 ('ive', 'see'): 1,
 ('ive', 'eating'

In [466]:
posWordGraphTrimmed = removeEdgesByWeight(posWordGraph, 8)
negWordGraphTrimmed = removeEdgesByWeight(negWordGraph, 8)

X, y = generateWordGraphVectors(sentimentTweetDict, posWordGraphTrimmed, negWordGraphTrimmed, "edge")
testDict = dfTest.to_dict(orient='index')
Xtest, ytest = generateWordGraphVectors(testDict, posWordGraphTrimmed, negWordGraphTrimmed, "edge")

model = kNN(n_neighbors = 5)
model.fit(X, y)
print("kNN score: " + str(model.score(Xtest, ytest)))

classifier = svm.SVC(kernel = "linear")
classifier.fit(X, y)
print("SVM score: " + str(classifier.score(Xtest, ytest)))

classifier = logreg()
classifier.fit(X, y)
print("Logistic regression score: " + str(classifier.score(Xtest, ytest)))

classifier = dectree()
classifier.fit(X, y)
print("Decision tree score: " + str(classifier.score(Xtest, ytest)))

kNN score: 0.5864928909952607
SVM score: 0.5746445497630331
Logistic regression score: 0.591824644549763
Decision tree score: 0.5746445497630331


In [469]:
print(len(negWordGraphTrimmed))
print(len(negWordGraph))

135
5272


In [401]:
X, y = generateWordGraphVectors(sentimentTweetDict, posWordGraphTrimmed, negWordGraphTrimmed, "MCSNS")
testDict = dfTest.to_dict(orient='index')
Xtest, ytest = generateWordGraphVectors(testDict, posWordGraphTrimmed, negWordGraphTrimmed, "MCSNS")

model = kNN(n_neighbors = 5)
model.fit(X, y)
print("kNN score: " + str(model.score(Xtest, ytest)))

classifier = svm.SVC(kernel = "linear")
classifier.fit(X, y)
print("SVM score: " + str(classifier.score(Xtest, ytest)))

classifier = logreg()
classifier.fit(X, y)
print("Logistic regression score: " + str(classifier.score(Xtest, ytest)))

classifier = dectree()
classifier.fit(X, y)
print("Decision tree score: " + str(classifier.score(Xtest, ytest)))

kNN score: 0.5438388625592417
SVM score: 0.580568720379147
Logistic regression score: 0.5817535545023697
Decision tree score: 0.5545023696682464


In [408]:
X, y = generateWordGraphVectors(sentimentTweetDict, posWordGraphTrimmed, negWordGraphTrimmed, "MCSUES")
testDict = dfTest.to_dict(orient='index')
Xtest, ytest = generateWordGraphVectors(testDict, posWordGraphTrimmed, negWordGraphTrimmed, "MCSUES")

model = kNN(n_neighbors = 5)
model.fit(X, y)
print("kNN score: " + str(model.score(Xtest, ytest)))

classifier = svm.SVC(kernel = "linear")
classifier.fit(X, y)
print("SVM score: " + str(classifier.score(Xtest, ytest)))

classifier = logreg()
classifier.fit(X, y)
print("Logistic regression score: " + str(classifier.score(Xtest, ytest)))

classifier = dectree()
classifier.fit(X, y)
print("Decision tree score: " + str(classifier.score(Xtest, ytest)))

kNN score: 0.5989336492890995
SVM score: 0.6149289099526066
Logistic regression score: 0.6137440758293838
Decision tree score: 0.5598341232227488


In [410]:
X, y = generateWordGraphVectors(sentimentTweetDict, posWordGraphTrimmed, negWordGraphTrimmed, "MCSDES")
testDict = dfTest.to_dict(orient='index')
Xtest, ytest = generateWordGraphVectors(testDict, posWordGraphTrimmed, negWordGraphTrimmed, "MCSDES")

model = kNN(n_neighbors = 5)
model.fit(X, y)
print("kNN score: " + str(model.score(Xtest, ytest)))

classifier = svm.SVC(kernel = "linear")
classifier.fit(X, y)
print("SVM score: " + str(classifier.score(Xtest, ytest)))

classifier = logreg()
classifier.fit(X, y)
print("Logistic regression score: " + str(classifier.score(Xtest, ytest)))

classifier = dectree()
classifier.fit(X, y)
print("Decision tree score: " + str(classifier.score(Xtest, ytest)))

kNN score: 0.5361374407582938
SVM score: 0.5669431279620853
Logistic regression score: 0.5728672985781991
Decision tree score: 0.5207345971563981
