In [62]:
import numpy as np

from sklearn import datasets, linear_model

from random import shuffle

In [30]:
def quoteCustomSplit(text):
    firstIndex, secondIndex = -1,-1
    for i in range(len(text)):
        c_i = text[i]
        c_l = text[i-1] if i > 0 else None
        c_r = text[i+1] if i < len(text) - 1 else None
        if c_i == '"' and c_l != "\\" and firstIndex == -1:
            firstIndex = i
        elif c_i == '"' and c_r == ',' and firstIndex != -1:
            secondIndex = i
            newText = text[0:firstIndex] + text[firstIndex:secondIndex].replace(",", "") + text[secondIndex:]
    return newText.split(",")
  
def readTwitterData(fname):
    parsedX = []
    parsedY = []
    
    with open(fname) as f:
        content = f.readlines()
    content = [x.strip() for x in content] 
    contentParsed = [text for text in content if len(text) > 0]

    for line in contentParsed:
        data = quoteCustomSplit(line)
        label = float(data[len(data) - 1])
        parsedY.append(label)

        newPoint = [float(x) for x in data[2:(len(data) - 4)]]

        if data[1] == "N": newPoint = [1,0,0,0,0] + newPoint
        if data[1] == "A": newPoint = [0,1,0,0,0] + newPoint
        if data[1] == "S": newPoint = [0,0,1,0,0] + newPoint
        if data[1] == "H": newPoint = [0,0,0,1,0] + newPoint
        if data[1] == "F": newPoint = [0,0,0,0,1] + newPoint

        if data[0] == "S": newPoint = [1,0,0,0] + newPoint
        if data[0] == "C": newPoint = [0,1,0,0] + newPoint
        if data[0] == "P": newPoint = [0,0,1,0] + newPoint
        if data[0] == "T": newPoint = [0,0,0,1] + newPoint

        parsedX.append(newPoint)

    f.close()

    return parsedX, parsedY
  
def linRegTrainTest(dataX, dataLabels, validationPercent):
    assert len(dataX) == len(dataLabels)
    assert len(dataLabels) > 0

    dimensionWeight = len(dataX[0])

    allIndices = [i for i in range(len(dataX))]
    shuffle(allIndices)

    numValidation = int(validationPercent * len(dataX))

    validationX = np.array(dataX[0:numValidation])
    validationY = np.array(dataLabels[0:numValidation])

    trainX = np.array(dataX[numValidation:])
    trainY = np.array(dataLabels[numValidation:])

    #print(trainY)
    
    # Create linear regression object
    regr = linear_model.LinearRegression()

    # Train the model using the training sets
    regr.fit(trainX, trainY)

    # The coefficients
    print('Coefficients: \n', regr.coef_)
    # The mean squared error
    print("Mean squared error: %.2f"
        % np.mean((regr.predict(validationX) - validationY) ** 2))
    # Explained variance score: 1 is perfect prediction
    print('Variance score: %.2f' % regr.score(validationX, validationY))

In [16]:
#res = quoteCustomSplit('A,long,list,"of,things,to talk",continued')
res = quoteCustomSplit('P,N,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,2.0569048,4.264416,SNLFinale,"America, I hear you. \n\nTune in TONIGHT for our season finale of @nbcsnl for suprises and big laughs. \n#SNLFinale https://t.co/VZl2P96eH7",America I hear you in TONIGHT for our season finale of nbcsnl for suprises and big laughs nSNLFinale ,0.1775814')
print(res)

['P', 'N', '0', '0', '0', '0', '0', '1', '0', '0', '0', '0', '0', '0', '1', '0', '0', '0', '2.0569048', '4.264416', 'SNLFinale', '"America I hear you. \n\nTune in TONIGHT for our season finale of @nbcsnl for suprises and big laughs. \n#SNLFinale https://t.co/VZl2P96eH7"', 'America I hear you in TONIGHT for our season finale of nbcsnl for suprises and big laughs nSNLFinale ', '0.1775814']


In [70]:
dataX, dataY = readTwitterData("vectorized_tweets.txt")

dataX = np.array(dataX)
dataY = np.array(dataY)

bestX, bestY = None, None

regr = linear_model.LinearRegression()
regr.fit(dataX, dataY)

np.set_printoptions(suppress=True)

columns = ["Topic: Sports", "Topic: Culture", "Topic: Politics", "Topic: Twitter/Misc.",
          "Emotion: Neutral", "Emotion: Angry", "Emotion: Sad", "Emotion: Happy/Hopeful", "Emotion: Funny/Satirical",
          "TIME2_6","TIME6_10","TIME10_14","TIME14_18","TIME18_22","TIME22_2",
          "DATE_SUN","DATE_MON","DATE_TUE","DATE_WED","DATE_THU","DATE_FRI","DATE_SAT",
          "PHOTO","VIDEO","ANIMATED_GIF",
          "LOG10_USER_FAV","LOG10_USER_STATUS_COUNT",
          "TOPIC","TEXT","SANITIZED_TEXT",
          "SCORE"]

# The coefficients
print('Coefficients: \n')

for i in range(len(columns)):
    print(columns[i] + " -> " + str(regr.coef_[i]))

# The mean squared error
print("Mean squared error: %f"
      % np.mean((regr.predict(dataX) - dataY) ** 2))
# Explained variance score: 1 is perfect prediction
print('Variance score: %f' % regr.score(dataX, dataY))


#clf = SVC()
#clf.fit(dataX, dataY)
#acc = clf.score(dataX, dataY)
#print(acc)

Coefficients: 

Topic: Sports -> 0.995604974238
Topic: Culture -> -0.74045603834
Topic: Politics -> -0.255148935898
Topic: Twitter/Misc. -> 1.33226762955e-15
Emotion: Neutral -> -0.633951266583
Emotion: Angry -> 0.803707274451
Emotion: Sad -> -0.550671495721
Emotion: Happy/Hopeful -> 0.226597240693
Emotion: Funny/Satirical -> 0.15431824716
TIME2_6 -> 0.805855234921
TIME6_10 -> -0.636656441824
TIME10_14 -> -0.762955782533
TIME14_18 -> 0.111819516538
TIME18_22 -> 0.30564135706
TIME22_2 -> 0.176296115838
DATE_SUN -> -0.441301269962
DATE_MON -> -1.8392484026
DATE_TUE -> 2.37270841285
DATE_WED -> -1.66533453694e-16
DATE_THU -> 0.339786939176
DATE_FRI -> -1.06911912022
DATE_SAT -> 0.637173440766
PHOTO -> -0.129607791382
VIDEO -> -0.178828430825
ANIMATED_GIF -> -0.188755340249
LOG10_USER_FAV -> -0.0314399615159
LOG10_USER_STATUS_COUNT -> -2.84858945355


IndexError: index 27 is out of bounds for axis 0 with size 27