In [1]:
import numpy as np

import sklearn

from sklearn.preprocessing import normalize
from sklearn import datasets, linear_model

from random import shuffle

In [2]:
def loadTopicCategoryDict(fname):
    """Build a topic -> category lookup from a comma-separated text file.

    Each non-blank line is stripped of surrounding whitespace and split on
    commas; the first field becomes the key and the second field the value.
    Any further fields on the line are ignored. When a key occurs more than
    once, the last occurrence wins.

    Args:
        fname: Path of the text file to read.

    Returns:
        dict mapping the first field of each line to its second field.

    Raises:
        IndexError: If a non-blank line contains no comma.
    """
    mapping = {}
    with open(fname) as handle:
        for rawLine in handle:
            entry = rawLine.strip()
            if not entry:
                # Blank (or whitespace-only) lines carry no mapping.
                continue
            fields = entry.split(",")
            mapping[fields[0]] = fields[1]
    return mapping
        
def quoteCustomSplit(text):
    """Split a comma-separated line, first removing commas inside a quoted span.

    The quoted span starts at the first double quote that is not preceded by a
    backslash and ends at the LAST later double quote that is immediately
    followed by a comma. Commas between those two positions are deleted (the
    quote characters themselves are kept), and the resulting string is split
    on ",".

    If no such span exists (no quotes at all, an unterminated quote, or an
    empty string), the text is split on "," unchanged. Previously this case
    raised UnboundLocalError because the result variable was never assigned.

    Args:
        text: One line of comma-separated data.

    Returns:
        list of str: the fields of the line.
    """
    firstIndex, secondIndex = -1, -1
    lastPos = len(text) - 1

    for i, ch in enumerate(text):
        if ch != '"':
            continue
        if firstIndex == -1:
            # Opening quote: must not be escaped by a preceding backslash.
            if i == 0 or text[i - 1] != "\\":
                firstIndex = i
        elif i < lastPos and text[i + 1] == ',':
            # Closing-quote candidate; later candidates override earlier ones,
            # so a '",' sequence inside the quoted text does not end the span.
            secondIndex = i

    if secondIndex == -1:
        # Nothing to sanitise: fall back to a plain split.
        return text.split(",")

    newText = (
        text[:firstIndex]
        + text[firstIndex:secondIndex].replace(",", "")
        + text[secondIndex:]
    )
    return newText.split(",")
  
def readTwitterData(fname, topicCategoryDict):
    """Parse a tweet data file into feature rows and labels.

    Each non-blank line is split with quoteCustomSplit. From the resulting
    fields:
      * field 0 is ignored,
      * fields 1 up to (but excluding) the fourth-from-last are parsed as floats,
      * the fourth-from-last field is the topic, looked up in topicCategoryDict,
      * the last field is parsed as the float label,
      * the two fields between topic and label are ignored.

    The topic's category ("S", "C", "P" or "T") is one-hot encoded and
    prepended to the float features.

    Args:
        fname: Path of the data file to read.
        topicCategoryDict: Mapping from topic string to category code.

    Returns:
        (parsedX, parsedY): list of feature rows (lists) and list of float labels.

    Raises:
        KeyError: If a line's topic is missing from topicCategoryDict
            (diagnostics are printed first).
        ValueError: If a field cannot be parsed as float, or if the category
            is not one of "S", "C", "P", "T". Previously an unknown category
            silently produced a row four columns shorter than the others.
    """
    # One-hot prefix per category code, column order S, C, P, T.
    categoryOneHot = {
        "S": [1, 0, 0, 0],
        "C": [0, 1, 0, 0],
        "P": [0, 0, 1, 0],
        "T": [0, 0, 0, 1],
    }

    parsedX = []
    parsedY = []

    # The with-block closes the file; no explicit close() is needed afterwards.
    with open(fname) as f:
        for rawLine in f:
            line = rawLine.strip()
            if len(line) == 0:
                continue

            data = quoteCustomSplit(line)

            features = [float(x) for x in data[1:(len(data) - 4)]]

            topic = data[len(data) - 4]
            if topic not in topicCategoryDict:
                print("Could not find topic: " + topic)
                print(topicCategoryDict)
                print("-------------------")
                # Same exception the plain dict lookup would raise, made explicit.
                raise KeyError(topic)

            category = topicCategoryDict[topic]
            if category not in categoryOneHot:
                raise ValueError(
                    "Unknown category %r for topic %r" % (category, topic)
                )

            parsedX.append(categoryOneHot[category] + features)
            parsedY.append(float(data[len(data) - 1]))

    return parsedX, parsedY

In [5]:
# Load the topic -> category table, then the tweet feature rows and labels.
topicDict = loadTopicCategoryDict("new_tweet_topics_category_dict.txt")
dataX, dataY = readTwitterData("new_tweets.txt", topicDict)

dataX, dataY = np.array(dataX), np.array(dataY)

# axis=1 rescales each sample (row) independently, not each feature column.
dataX = normalize(dataX, axis=1)

bestX = bestY = None

# Ordinary least-squares fit on the full data set (no train/test split here).
regr = linear_model.LinearRegression().fit(dataX, dataY)

# Print small floats in fixed-point rather than scientific notation.
np.set_printoptions(suppress=True)

"""
linRegColumns = ["Topic: Sports", "Topic: Culture", "Topic: Politics", "Topic: Twitter/Misc.",
          "Emotion: Neutral", "Emotion: Angry", "Emotion: Sad", "Emotion: Happy/Hopeful", "Emotion: Funny/Satirical",
          "TIME2_6","TIME6_10","TIME10_14","TIME14_18","TIME18_22","TIME22_2",
          "DATE_SUN","DATE_MON","DATE_TUE","DATE_WED","DATE_THU","DATE_FRI","DATE_SAT",
          "PHOTO","VIDEO","ANIMATED_GIF",
          "LOG10_USER_FAV","LOG10_USER_STATUS_COUNT"]
"""

# Fitted weights, one per feature column.
print('Coefficients: \n')
for i, weight in enumerate(regr.coef_):
    print(" -> %.2f" % weight)
print('\n')

# Both metrics below are computed on the same data the model was fitted on.
print("Mean squared error: %f" % np.mean((regr.predict(dataX) - dataY) ** 2))
# R^2 score as returned by the estimator: 1 is perfect prediction.
print('Variance score: %f' % regr.score(dataX, dataY))


# Disabled experiment: SVC classifier on the same data.
#clf = SVC()
#clf.fit(dataX, dataY)
#acc = clf.score(dataX, dataY)
#print(acc)

KeyError: 'Paraná'