In [1]:
import numpy as np

import sklearn

from sklearn.preprocessing import normalize
from sklearn import datasets, linear_model

from random import shuffle

In [32]:
def loadTopicCategoryDict(fname):
    result = {}
    with open(fname, encoding="iso-8859-1") as f:
        content = f.readlines()
    content = [x.strip() for x in content] 
    contentParsed = [text for text in content if len(text) > 0]
    for line in contentParsed:
        tokens = line.split(",")
        result[tokens[0]] = tokens[1]
    return result
        
def quoteCustomSplit(text):
    firstIndex, secondIndex = -1,-1
    for i in range(len(text)):
        c_i = text[i]
        c_l = text[i-1] if i > 0 else None
        c_r = text[i+1] if i < len(text) - 1 else None
        if c_i == '"' and c_l != "\\" and firstIndex == -1:
            firstIndex = i
        elif c_i == '"' and c_r == ',' and firstIndex != -1:
            secondIndex = i
            newText = text[0:firstIndex] + text[firstIndex:secondIndex].replace(",", "") + text[secondIndex:]
    return newText.split(",")
  
def readTwitterData(fname, topicCategoryDict):
    parsedX = []
    parsedY = []
    
    with open(fname) as f:
        content = f.readlines()
    content = [x.strip() for x in content] 
    contentParsed = [text for text in content if len(text) > 0]

    for line in contentParsed:
        data = quoteCustomSplit(line)

        newPoint = [float(x) for x in data[1:(len(data) - 4 - 3)]]

        topic = data[len(data) - 4]
        
        if topic not in topicCategoryDict:
            print("Could not find topic: " + topic)
            #print(topicCategoryDict)
            print("-------------------")
            continue
        
        category = topicCategoryDict[topic]
    

        if category == "S": newPoint = [1,0,0,0] + newPoint
        if category == "C": newPoint = [0,1,0,0] + newPoint
        if category == "P": newPoint = [0,0,1,0] + newPoint
        if category == "T": newPoint = [0,0,0,1] + newPoint

        parsedX.append(newPoint)
        
        label = 1.0 if float(data[len(data) - 1]) > 0.5 else 0.0
        parsedY.append(label)

    f.close()

    return parsedX, parsedY

In [56]:
topicDict = loadTopicCategoryDict("new_tweet_topics_category_dict.txt")
dataX, dataY = readTwitterData("new_tweets.txt", topicDict)

dataX = np.array(dataX)
dataY = np.array(dataY)

print(dataX[0])
print(dataY[0])

dataX = sklearn.preprocessing.normalize(dataX, axis=0)

print(dataX[0])
print(dataY[0])

bestX, bestY = None, None

regr = linear_model.LogisticRegression(penalty="l1")
regr.fit(dataX, dataY)

np.set_printoptions(suppress=True)

"""
linRegColumns = ["Topic: Sports", "Topic: Culture", "Topic: Politics", "Topic: Twitter/Misc.",
          "Emotion: Neutral", "Emotion: Angry", "Emotion: Sad", "Emotion: Happy/Hopeful", "Emotion: Funny/Satirical",
          "TIME2_6","TIME6_10","TIME10_14","TIME14_18","TIME18_22","TIME22_2",
          "DATE_SUN","DATE_MON","DATE_TUE","DATE_WED","DATE_THU","DATE_FRI","DATE_SAT",
          "PHOTO","VIDEO","ANIMATED_GIF",
          "LOG10_USER_FAV","LOG10_USER_STATUS_COUNT"]
"""

allColumnsString = "Topic: Sports,Topic: Culture,Topic: Politics,Topic: Twitter/Misc.,SENTIMENT," \
"TIME2_4,TIME4_6,TIME6_8,TIME8_10,TIME10_12,TIME12_14,TIME14_16,TIME16_18,TIME18_20,TIME20_22,TIME22_24,TIME24_2,"\
"DATE_SUN,DATE_MON,DATE_TUE,DATE_WED,DATE_THU,DATE_FRI,DATE_SAT,"\
"PHOTO,VIDEO,ANIMATED_GIF,"\
"USER_FAV,USER_STATUS_COUNT,"\
"FAVORITES,RETWEETS,TOPIC_SCORE,"\
"TOPIC,TEXT,SANITIZED_TEXT,"\
"SCORE"

linRegColumns = allColumnsString.split(",")

# The coefficients
print('Coefficients: \n')

#for i in range(len(regr.coef_)):
    #print(linRegColumns[i] + " -> %.2f" % regr.coef_[i])

print('\n')    
    
# The mean squared error
print("Mean squared error: %f"
      % np.mean((regr.predict(dataX) - dataY) ** 2))
# Explained variance score: 1 is perfect prediction
print('Variance score: %f' % regr.score(dataX, dataY))

for i in range(100):
    pred = np.dot(regr.coef_[0].T, dataX[i])
    #pred = regr.predict(dataX)
    actual = dataY[i]
    print(str(pred) + " " + str(actual))
    pass

#clf = SVC()
#clf.fit(dataX, dataY)
#acc = clf.score(dataX, dataY)
#print(acc)

Could not find topic: Paraná
-------------------
Could not find topic: Paraná
-------------------
Could not find topic: Paraná
-------------------
Could not find topic: Paraná
-------------------
[      0.       1.       0.       0.       0.       1.       0.       0.
       0.       0.       0.       0.       0.       0.       0.       0.
       0.       0.       0.       0.       1.       0.       0.       0.
       0.       0.       0.    1769.  175957.]
0.0
[ 0.          0.06019293  0.          0.          0.          0.10540926
  0.          0.          0.          0.          0.          0.          0.
  0.          0.          0.          0.          0.          0.          0.
  0.07738232  0.          0.          0.          0.          0.          0.
  0.00386718  0.06919319]
0.0
Coefficients: 



Mean squared error: 0.348946
Variance score: 0.651054
-0.933376695133 0.0
-0.376441602554 0.0
-1.2600561279 0.0
-0.789115281349 0.0
-0.0460281014617 0.0
-0.0776744775114 1.0
-0.58277

In [35]:
linreg = np.linalg.lstsq(dataX, dataY)[0]
print(linreg)

bias = np.array([])

print("Mean squared error: %f"
      % np.mean((np.dot(dataX, linreg) - dataY) ** 2))

[  2.97617565e+14   1.13432185e+15   5.28880291e+14   5.79359332e+14
   3.75000000e-01  -3.21994687e+14  -2.42388795e+14  -1.85903719e+14
  -1.35764881e+14  -1.07331562e+14  -1.85903719e+14  -2.53992834e+14
  -1.66277341e+14  -1.01823661e+14  -1.07331562e+14  -1.39943236e+14
  -3.11076421e+14  -1.42628978e+13  -8.41079623e+13  -2.35402145e+14
  -4.43730950e+14  -4.94022420e+14  -1.26416955e+13   3.20442705e+13
   7.18750000e-01  -8.41308594e-01  -6.64062500e-02  -3.53515625e-01
  -3.78857422e+00]
Mean squared error: 0.208694
