In [2]:
import json
import matplotlib as mpl
import numpy as np
import sklearn as skl

## Read and preprocess data from json

In [3]:
sent_f = open("data/sentiment_data.json")
twitter_f = open("data/tweets.json")
google_dict = json.load(sent_f)
twitter_dict = json.load(twitter_f)

In [4]:
Xs = np.array(list(twitter_dict.values()))
Ys = []
for id in google_dict.keys():
    Ys.append(google_dict[id]["score"])
Ys = np.array(Ys)
Ys = (Ys*10).astype(int)

#### Create train test split

In [5]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(Xs, Ys, test_size=0.15)

#### Vectorize strings so they can go into models

In [6]:
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer()
vectorizer.fit(X_train)

X_train = vectorizer.transform(X_train)
X_test  = vectorizer.transform(X_test)

## Linear Regression

In [7]:
from sklearn.linear_model import LinearRegression
regr = LinearRegression()
regr.fit(X_train, y_train)
y_pred = regr.predict(X_test)

print("Mean squared error: %.2f" % skl.metrics.mean_squared_error(y_test, y_pred))
print("Coefficient of determination (r2): %.2f" % skl.metrics.r2_score(y_test, y_pred))

Mean squared error: 0.66
Coefficient of determination (r2): 0.96


## Support Vector Machine

In [8]:
from sklearn.svm import SVR
regr = SVR()
regr.fit(X_train, y_train)
y_pred = regr.predict(X_test)

print("Mean squared error: %.2f" % skl.metrics.mean_squared_error(y_test, y_pred))
print("Coefficient of determination (r2): %.2f" % skl.metrics.r2_score(y_test, y_pred))


Mean squared error: 4.81
Coefficient of determination (r2): 0.69


## XGBoost

In [10]:
from xgboost.sklearn import XGBRegressor
regr = XGBRegressor(verbosity=0)
regr.fit(X_train, y_train)
y_pred = regr.predict(X_test)

print("Mean squared error: %.2f" % skl.metrics.mean_squared_error(y_test, y_pred))
print("Coefficient of determination (r2): %.2f" % skl.metrics.r2_score(y_test, y_pred))

Mean squared error: 1.67
Coefficient of determination (r2): 0.89
