# imports

In [1]:
import csv
import pandas as pd
import matplotlib.pyplot as plt
from io import StringIO
import networkx as nx
import re

import numpy as np
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize 
stop_words = set(stopwords.words('english')) 

from sklearn.neighbors import KNeighborsClassifier as kNN
from sklearn.linear_model import LogisticRegression as logreg
from sklearn import svm
from sklearn.tree import DecisionTreeClassifier as dectree

[nltk_data] Downloading package stopwords to C:\Users\Harrison
[nltk_data]     Lu\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# explore data

In [2]:
df = pd.read_csv('./dataset/sentiment140/data.csv', encoding='latin-1', header = None)
df.columns = ['sentiment', 'ID', 'date', 'query', 'username', 'text']

df = df.applymap(lambda s: s.lower() if type(s) == str else s)
df.head()

Unnamed: 0,sentiment,ID,date,query,username,text
0,0,1467810369,mon apr 06 22:19:45 pdt 2009,no_query,_thespecialone_,"@switchfoot http://twitpic.com/2y1zl - awww, t..."
1,0,1467810672,mon apr 06 22:19:49 pdt 2009,no_query,scotthamilton,is upset that he can't update his facebook by ...
2,0,1467810917,mon apr 06 22:19:53 pdt 2009,no_query,mattycus,@kenichan i dived many times for the ball. man...
3,0,1467811184,mon apr 06 22:19:57 pdt 2009,no_query,ellectf,my whole body feels itchy and like its on fire
4,0,1467811193,mon apr 06 22:19:57 pdt 2009,no_query,karoli,"@nationwideclass no, it's not behaving at all...."


# splitting entire dataset

not used in experiment below yet

In [3]:
dfTrain = df.sample(frac = 0.8, random_state = 20)
dfTest = df.drop(dfTrain.index)

dfGraphModel = dfTrain.sample(frac = 0.5, random_state = 20)
dfDM = dfTrain.drop(dfGraphModel.index)

#subset of the dfTrain dataset, split into positive, negative, and neutral tweets
pos = dfGraphModel.loc[df['sentiment'] == 4]
neg = dfGraphModel.loc[df['sentiment'] == 0]
neut = dfGraphModel.loc[df['sentiment'] == 2]

posText = {line["ID"]: line["text"] for index, line in pos.iterrows()}
negText = {line["ID"]: line["text"] for index, line in neg.iterrows()}

posWords = [line.rstrip('\n') for line in open('./dataset/positive-words.txt') if line.split() and list(line)[0] != ';']
negWords = [line.rstrip('\n') for line in open('./dataset/negative-words.txt') if line.split() and list(line)[0] != ";"]

# filtering by target word "food"
also removing handles. 

result in 4000 something tweets

In [5]:
subjectTweetsDict = {} # key = index in original df and value = tweet info (sentiment, id, date, query, username, full tweet)
filteredTweets = {} # key = index in original df and value = filtered tweet
count = 0
for row in df.itertuples():
    if 'food' in row[6]:
        subjectTweetsDict[row[0]] = list(row)[1:7]
        word_tokens = str(row[6]).split() #split by white space
        filtered_sentence = [w for w in word_tokens if not w in stop_words] 
        filtered_sentence = [] 
        i = 0
        while i < len(word_tokens):
            if '@' in word_tokens[i]: #Taking out handles from tweets
                i = i + 1
            elif word_tokens[i] not in stop_words:
                filtered_sentence.append(word_tokens[i])
            i = i + 1
        filteredTweets[row[0]] = filtered_sentence
#print(subjectTweetsDict)
#df2 = pd.DataFrame(data=subjectTweetsDict, columns = ['index', 'sentiment', 'id', 'date', 'query', 'username', 'tweet'])
dfSubject= pd.DataFrame.from_dict(subjectTweetsDict, orient = 'index', columns = ['sentiment', 'id', 'date', 'query', 'username', 'text'])
for key, value in filteredTweets.items():
    filteredString = ' '.join(value)
    dfSubject.at[key, "text"] = filteredString

dfSubject.tail()

Unnamed: 0,sentiment,id,date,query,username,text
1598507,4,2193188422,tue jun 16 08:06:59 pdt 2009,no_query,melisarenea,ughh hate wake early drink...aren't supposed s...
1598732,4,2193255029,tue jun 16 08:12:27 pdt 2009,no_query,jessie2point0,needs food iphone?
1599079,4,2193344265,tue jun 16 08:19:50 pdt 2009,no_query,sanityknit,text tell things going! miss guys!! fun - nice...
1599611,4,2193478364,tue jun 16 08:30:46 pdt 2009,no_query,kirstylol,needs food soo much. going watch t.v shower la...
1599831,4,2193551690,tue jun 16 08:36:43 pdt 2009,no_query,exbp_buddhist,"could born emporer penguin. minus 200, carryin..."


## splitting our dataset into training, testing set.
split training into creating graph model, as well as creating the data mining model. 
split graph model into positive, negative, and neutral.

no neutral tweets, as we have found

In [6]:
dfTrain = dfSubject.sample(frac = 0.8, random_state = 20)
dfTest = dfSubject.drop(dfTrain.index)

dfGraphModel = dfTrain.sample(frac = 0.6, random_state = 20)
dfDM = dfTrain.drop(dfGraphModel.index)

posSubject = dfGraphModel.loc[dfSubject['sentiment'] == 4]
negSubject = dfGraphModel.loc[dfSubject['sentiment'] == 0]
neutSubject = dfGraphModel.loc[dfSubject['sentiment'] == 2] #there is no neutral

# writing functions to evaluate how each model works

In [9]:
'''this function creates the word graph given the text in the tweet and the 
frame that is used to create the graph model. Can accept both an array of strings to 
treat as one long string, or a single string.'''

def createGraphFromTweet(text, frame):
    wordGraph = nx.DiGraph()
    if type(text) == str:
        createGraph(text, wordGraph, frame)
    if type(text) == list:
        for element in text:
            createGraph(element, wordGraph, frame)
    return wordGraph

'''helper function- NOT FOR USE.'''
def createGraph(text, wordGraph, frame):
    if text:
        text = re.sub(r'[^\w\s]', '', str(text))
        text = text.split()
        try:
            if len(text) == 1:
                wordGraph.add_edge(text[0], text[0], weight = 1)
            for x in range(len(text) - frame):
                for y in range(1, frame + 1):
                    n1 = text[x]
                    n2 = text[x+y]
                    if wordGraph.has_edge(n1, n2):
                        wordGraph[n1][n2]['weight'] = wordGraph[n1][n2]['weight'] + 1
                    else:
                        wordGraph.add_edge(n1, n2, weight = 1)
            for x in reversed(range(1, frame + 1)):
                for y in reversed(range(1, x)):
                    n1 = text[len(text) - x]
                    n2 = text[len(text) - y]
                    if wordGraph.has_edge(n1, n2):
                        wordGraph[n1][n2]['weight'] = wordGraph[n1][n2]['weight'] + 1
                    else:
                        wordGraph.add_edge(n1, n2, weight = 1)
        except IndexError:
            return createGraph(text, wordGraph, frame-1)
        return wordGraph
    else:
        return wordGraph


In [10]:
'''extracting text from the dataframe'''
posArrayText = posSubject.as_matrix(columns = posSubject.columns[-1:]).flatten().tolist()
negArrayText = negSubject.as_matrix(columns = negSubject.columns[-1:]).flatten().tolist()

'''putting text into one large graph'''
posWordGraph = createGraphFromTweet(posArrayText, 4)
negWordGraph = createGraphFromTweet(negArrayText, 4)

  
  This is separate from the ipykernel package so we can avoid doing imports until


In [11]:
'''GRAPH SIMILARITY FUNCTIONS'''

'''counts the number of identical edges between two graphs, normalizes by the minimum number of nodes in either graph'''
def edgeSimilarity(inputGraph, model):
    count = 0
    for edge in inputGraph.edges():
        n1, n2 = edge
        if model.has_edge(n1, n2): 
            count += 1
    return count/min(len(inputGraph), len(model))

'''returns maximum common subgraph of inputs'''
def getMCS(graphModel, tweetGraph):
    matching_graph=nx.Graph() #subgraph
    for n1,n2,attr in tweetGraph.edges(data=True):
        if graphModel.has_edge(n1,n2) :
            matching_graph.add_edge(n1,n2,weight=1)
    graphs = list(nx.connected_component_subgraphs(matching_graph))
    mcs_length = 0
    mcs_graph = nx.DiGraph() 
    
    for i, graph in enumerate(graphs):                        
        if len(graph.nodes()) > mcs_length: 
            mcs_length = len(graph.nodes())
            mcs_graph = graph
    return mcs_graph

'''returns number of common nodes in MCS and model graph, normalized by minimum number of nodes'''
def MCSNS(mcs_graph, graphModel, tweetGraph):
    return len(mcs_graph)/min(len(graphModel),len(tweetGraph))

'''returns number of common edges in MCS and model graph, noramlized by minimum number of nodes'''
def MCSUES(mcs_graph, graphModel, tweetGraph): 
    return len(mcs_graph.edges())/min(len(graphModel),len(tweetGraph))

'''returns number of common edges in the MCS and model graph, taking direction into account,
normalized by minimum number of nodes'''
def MCSDES(mcs_graph, graphModel, tweetGraph): 
    count = 0
    for e1,e2 in mcs_graph.edges():
        if tweetGraph.has_edge(e1,e2) and graphModel.has_edge(e1,e2):
            count+=1
    return count/min(len(graphModel),len(tweetGraph))

In [18]:
'''generates the feature vectors and labels (posValue, negValue).
   -dataframeDict is the dataframe above turned into a dictionary, with ID as 
   key and the rest of the information as value, stored as a nested dictionary
   -posWordGraph/negWordGraph are our pos/neg graph models
   -metric type refers to one of the above functions listed in the above cell block'''
def generateWordGraphVectors(dataframeDict, posWordGraph, negWordGraph, metricType):
    X = []
    y = []
    count = 0
    for key, value in dataframeDict.items():
        if value['text']:
            y.append(value['sentiment'])
            wordGraph = createGraphFromTweet(value['text'], 4)
            posNegArray = []
            if metricType == "edge":
                posNegArray.append(edgeSimilarity(wordGraph, posWordGraph))
                posNegArray.append(edgeSimilarity(wordGraph, negWordGraph))
            elif metricType == "MCSNS":
                mcs_graph = getMCS(posWordGraph, wordGraph)
                posNegArray.append(MCSNS(mcs_graph, wordGraph, posWordGraph))
                mcs_graph = getMCS(negWordGraph, wordGraph)
                posNegArray.append(MCSNS(mcs_graph, wordGraph, negWordGraph))
            elif metricType == "MCSUES":
                mcs_graph = getMCS(posWordGraph, wordGraph)
                posNegArray.append(MCSUES(mcs_graph, wordGraph, posWordGraph))
                mcs_graph = getMCS(negWordGraph, wordGraph)
                posNegArray.append(MCSUES(mcs_graph, wordGraph, negWordGraph))
            elif metricType == "MCSDES":
                mcs_graph = getMCS(posWordGraph, wordGraph)
                posNegArray.append(MCSDES(mcs_graph, wordGraph, posWordGraph))
                mcs_graph = getMCS(negWordGraph, wordGraph)
                posNegArray.append(MCSDES(mcs_graph, wordGraph, negWordGraph))
            X.append(posNegArray)
            count += 1
    return X, y


In [19]:
'''turn our dataframe into dictionary format'''
sentimentTweetDict = dfDM.to_dict(orient='index')

In [20]:
'''evaluate model, vectors generated by edgeSimilarity function. 
   Model evaluated using kNN, SVM, logistic regression, and decision tree.'''
X, y = generateWordGraphVectors(sentimentTweetDict, posWordGraph, negWordGraph, "edge")
testDict = dfTest.to_dict(orient='index')
Xtest, ytest = generateWordGraphVectors(testDict, posWordGraph, negWordGraph, "edge")


model = kNN(n_neighbors = 5)
model.fit(X, y)
print("kNN score: " + str(model.score(Xtest, ytest)))

classifier = svm.SVC(kernel = "linear")
classifier.fit(X, y)
print("SVM score: " + str(classifier.score(Xtest, ytest)))

classifier = logreg()
classifier.fit(X, y)
print("Logistic regression score: " + str(classifier.score(Xtest, ytest)))

classifier = dectree()
classifier.fit(X, y)
print("Decision tree score: " + str(classifier.score(Xtest, ytest)))

kNN score: 0.582345971563981
SVM score: 0.620260663507109
Logistic regression score: 0.625
Decision tree score: 0.5598341232227488


In [21]:
'''identical as above, except vectors generated by MCSNS'''
X, y = generateWordGraphVectors(sentimentTweetDict, posWordGraph, negWordGraph, "MCSNS")
testDict = dfTest.to_dict(orient='index')
Xtest, ytest = generateWordGraphVectors(testDict, posWordGraph, negWordGraph, "MCSNS")

model = kNN(n_neighbors = 5)
model.fit(X, y)
print("kNN score: " + str(model.score(Xtest, ytest)))

classifier = svm.SVC(kernel = "linear")
classifier.fit(X, y)
print("SVM score: " + str(classifier.score(Xtest, ytest)))

classifier = logreg()
classifier.fit(X, y)
print("Logistic regression score: " + str(classifier.score(Xtest, ytest)))

classifier = dectree()
classifier.fit(X, y)
print("Decision tree score: " + str(classifier.score(Xtest, ytest)))

kNN score: 0.5438388625592417
SVM score: 0.580568720379147
Logistic regression score: 0.5817535545023697
Decision tree score: 0.5556872037914692


In [22]:
'''identical as above, except vectors generated by MCSUES'''
X, y = generateWordGraphVectors(sentimentTweetDict, posWordGraph, negWordGraph, "MCSUES")
testDict = dfTest.to_dict(orient='index')
Xtest, ytest = generateWordGraphVectors(testDict, posWordGraph, negWordGraph, "MCSUES")

model = kNN(n_neighbors = 5)
model.fit(X, y)
print("kNN score: " + str(model.score(Xtest, ytest)))

classifier = svm.SVC(kernel = "linear")
classifier.fit(X, y)
print("SVM score: " + str(classifier.score(Xtest, ytest)))

classifier = logreg()
classifier.fit(X, y)
print("Logistic regression score: " + str(classifier.score(Xtest, ytest)))

classifier = dectree()
classifier.fit(X, y)
print("Decision tree score: " + str(classifier.score(Xtest, ytest)))

kNN score: 0.5989336492890995
SVM score: 0.6149289099526066
Logistic regression score: 0.6137440758293838
Decision tree score: 0.5598341232227488


In [23]:
'''identical as above, except vectors generated by MCSDES'''
X, y = generateWordGraphVectors(sentimentTweetDict, posWordGraph, negWordGraph, "MCSDES")
testDict = dfTest.to_dict(orient='index')
Xtest, ytest = generateWordGraphVectors(testDict, posWordGraph, negWordGraph, "MCSDES")

model = kNN(n_neighbors = 5)
model.fit(X, y)
print("kNN score: " + str(model.score(Xtest, ytest)))

classifier = svm.SVC(kernel = "linear")
classifier.fit(X, y)
print("SVM score: " + str(classifier.score(Xtest, ytest)))

classifier = logreg()
classifier.fit(X, y)
print("Logistic regression score: " + str(classifier.score(Xtest, ytest)))

classifier = dectree()
classifier.fit(X, y)
print("Decision tree score: " + str(classifier.score(Xtest, ytest)))

kNN score: 0.5616113744075829
SVM score: 0.5752369668246445
Logistic regression score: 0.5829383886255924
Decision tree score: 0.5284360189573459


# here we start considering edge weights. 
weights = how many times the edge appeared.

In [24]:
'''remove edges that have under a certain weight, also remove isolated 
nodes after we run this function.'''
def removeEdgesByWeight(graph, threshold):
    returnGraph = graph.copy()
    edgeCountDict = nx.get_edge_attributes(returnGraph, 'weight')
    for key, value in edgeCountDict.items():
        if value <= threshold:
            returnGraph.remove_edge(*key)
    emptyNodes = list(nx.isolates(returnGraph))
    returnGraph.remove_nodes_from(emptyNodes)
    return returnGraph

In [29]:
'''trim edges'''
posWordGraphTrimmed = removeEdgesByWeight(posWordGraph, 1)
negWordGraphTrimmed = removeEdgesByWeight(negWordGraph, 1)

In [25]:
'''evaluate, same as above, but with new "trimmed" graphs, using edge similarity'''
X, y = generateWordGraphVectors(sentimentTweetDict, posWordGraphTrimmed, negWordGraphTrimmed, "edge")
testDict = dfTest.to_dict(orient='index')
Xtest, ytest = generateWordGraphVectors(testDict, posWordGraphTrimmed, negWordGraphTrimmed, "edge")

model = kNN(n_neighbors = 5)
model.fit(X, y)
print("kNN score: " + str(model.score(Xtest, ytest)))

classifier = svm.SVC(kernel = "linear")
classifier.fit(X, y)
print("SVM score: " + str(classifier.score(Xtest, ytest)))

classifier = logreg()
classifier.fit(X, y)
print("Logistic regression score: " + str(classifier.score(Xtest, ytest)))

classifier = dectree()
classifier.fit(X, y)
print("Decision tree score: " + str(classifier.score(Xtest, ytest)))

kNN score: 0.5989336492890995
SVM score: 0.6226303317535545
Logistic regression score: 0.6279620853080569
Decision tree score: 0.5864928909952607


In [26]:
'''evaluate, same as above, but with new "trimmed" graphs, using MCSNS'''
X, y = generateWordGraphVectors(sentimentTweetDict, posWordGraphTrimmed, negWordGraphTrimmed, "MCSNS")
testDict = dfTest.to_dict(orient='index')
Xtest, ytest = generateWordGraphVectors(testDict, posWordGraphTrimmed, negWordGraphTrimmed, "MCSNS")

model = kNN(n_neighbors = 5)
model.fit(X, y)
print("kNN score: " + str(model.score(Xtest, ytest)))

classifier = svm.SVC(kernel = "linear")
classifier.fit(X, y)
print("SVM score: " + str(classifier.score(Xtest, ytest)))

classifier = logreg()
classifier.fit(X, y)
print("Logistic regression score: " + str(classifier.score(Xtest, ytest)))

classifier = dectree()
classifier.fit(X, y)
print("Decision tree score: " + str(classifier.score(Xtest, ytest)))

kNN score: 0.5639810426540285
SVM score: 0.5764218009478673
Logistic regression score: 0.5930094786729858
Decision tree score: 0.5633886255924171


In [27]:
'''evaluate, same as above, but with new "trimmed" graphs, using MCSUES'''
X, y = generateWordGraphVectors(sentimentTweetDict, posWordGraphTrimmed, negWordGraphTrimmed, "MCSUES")
testDict = dfTest.to_dict(orient='index')
Xtest, ytest = generateWordGraphVectors(testDict, posWordGraphTrimmed, negWordGraphTrimmed, "MCSUES")

model = kNN(n_neighbors = 5)
model.fit(X, y)
print("kNN score: " + str(model.score(Xtest, ytest)))

classifier = svm.SVC(kernel = "linear")
classifier.fit(X, y)
print("SVM score: " + str(classifier.score(Xtest, ytest)))

classifier = logreg()
classifier.fit(X, y)
print("Logistic regression score: " + str(classifier.score(Xtest, ytest)))

classifier = dectree()
classifier.fit(X, y)
print("Decision tree score: " + str(classifier.score(Xtest, ytest)))

kNN score: 0.5829383886255924
SVM score: 0.5995260663507109
Logistic regression score: 0.6167061611374408
Decision tree score: 0.5651658767772512


In [28]:
'''evaluate, same as above, but with new "trimmed" graphs, using MCSDES'''
X, y = generateWordGraphVectors(sentimentTweetDict, posWordGraphTrimmed, negWordGraphTrimmed, "MCSDES")
testDict = dfTest.to_dict(orient='index')
Xtest, ytest = generateWordGraphVectors(testDict, posWordGraphTrimmed, negWordGraphTrimmed, "MCSDES")

model = kNN(n_neighbors = 5)
model.fit(X, y)
print("kNN score: " + str(model.score(Xtest, ytest)))

classifier = svm.SVC(kernel = "linear")
classifier.fit(X, y)
print("SVM score: " + str(classifier.score(Xtest, ytest)))

classifier = logreg()
classifier.fit(X, y)
print("Logistic regression score: " + str(classifier.score(Xtest, ytest)))

classifier = dectree()
classifier.fit(X, y)
print("Decision tree score: " + str(classifier.score(Xtest, ytest)))

kNN score: 0.5657582938388626
SVM score: 0.5740521327014217
Logistic regression score: 0.601303317535545
Decision tree score: 0.5841232227488151
