In [1]:
import json

import matplotlib as mpl
import numpy as np
import sklearn as skl
from sklearn.metrics import mean_squared_error, r2_score

## Read and preprocess data from JSON

In [2]:
# Load the two JSON exports. The `with` blocks close each file handle as soon
# as parsing finishes (or fails), and the explicit encoding keeps the read
# independent of the platform's default locale.
with open("data/sentiment_data.json", encoding="utf-8") as sent_f:
    google_dict = json.load(sent_f)
with open("data/tweets.json", encoding="utf-8") as twitter_f:
    twitter_dict = json.load(twitter_f)

In [4]:
# Build the feature array (Xs) from the values of twitter_dict and the target
# array (Ys) from the "Score" field of each google_dict record.
# NOTE(review): presumably the twitter_dict values are the raw tweet strings
# and both dicts are keyed by the same record ID -- confirm against the files.
if twitter_dict.keys() == google_dict.keys():
    # Same key set: join on the key so the pairing of text and score does not
    # silently depend on the two files listing their records in the same order.
    record_ids = list(twitter_dict)
    Xs = np.array([twitter_dict[record_id] for record_id in record_ids])
    Ys = np.array([google_dict[record_id]["Score"] for record_id in record_ids])
else:
    # Key sets differ: fall back to pairing by position, but fail loudly if
    # the two sources do not even contain the same number of records.
    Xs = np.array(list(twitter_dict.values()))
    Ys = np.array([record["Score"] for record in google_dict.values()])
    if len(Xs) != len(Ys):
        raise ValueError(
            f"Cannot pair tweets with scores: {len(Xs)} tweets vs {len(Ys)} scores"
        )

# Scale the scores by 10 and cast to int; astype(int) truncates toward zero.
# NOTE(review): if the scores are not exact tenths, np.rint before the cast
# may be what is intended -- confirm before changing the labels.
Ys = (Ys * 10).astype(int)

#### Create train/test split

In [8]:
from sklearn.model_selection import train_test_split

# Hold out 15% of the samples for evaluation. The fixed random_state makes the
# split -- and therefore every metric reported below -- reproducible across
# kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(
    Xs, Ys, test_size=0.15, random_state=42
)

#### Vectorize strings so they can go into models

In [9]:
from sklearn.feature_extraction.text import CountVectorizer

# Learn the vocabulary from the training split only and encode it in one step;
# the test split is then encoded with that same fitted vocabulary.
vectorizer = CountVectorizer()
X_train = vectorizer.fit_transform(X_train)
X_test = vectorizer.transform(X_test)

#### Create baseline data set (random features with the same shape as the test set)

In [12]:
# Show the dimensions of the vectorized test matrix.
print(X_test.shape)

(195, 3557)


In [16]:
# Baseline input: a matrix of uniformly drawn integers in [-10, 10) with the
# same shape as X_test. A dedicated seeded generator keeps the baseline scores
# reproducible and independent of any other use of NumPy's global RNG state.
rand_X = np.random.default_rng(42).integers(low=-10, high=10, size=X_test.shape)

## Linear Regression

In [19]:
from sklearn.linear_model import LinearRegression

# Fit a linear regression on the vectorized training data, then predict on
# the held-out test matrix and on the random baseline matrix (rand_X).
regr = LinearRegression()
regr.fit(X_train, y_train)
y_pred = regr.predict(X_test)
rand_pred = regr.predict(rand_X)

# The metric functions are imported by name in the imports cell rather than
# reached through `skl.metrics`, which only resolves if something else has
# already loaded that submodule.
print("Mean squared error: %.2f" % mean_squared_error(y_test, y_pred))
print("Coefficient of determination (r2): %.2f" % r2_score(y_test, y_pred))
print(f"Baseline r2: {r2_score(y_test, rand_pred)}")

Mean squared error: 3.81
Coefficient of determination (r2): 0.64
Baseline r2: -2189.614496142303


## Support Vector Machine

In [20]:
from sklearn.svm import SVR

# Fit a support vector regressor (library defaults) on the vectorized training
# data, then predict on the held-out test matrix and on the random baseline
# matrix (rand_X).
regr = SVR()
regr.fit(X_train, y_train)
y_pred = regr.predict(X_test)
rand_pred = regr.predict(rand_X)

# The metric functions are imported by name in the imports cell rather than
# reached through `skl.metrics`, which only resolves if something else has
# already loaded that submodule.
print("Mean squared error: %.2f" % mean_squared_error(y_test, y_pred))
print("Coefficient of determination (r2): %.2f" % r2_score(y_test, y_pred))
print(f"Baseline r2: {r2_score(y_test, rand_pred)}")


Mean squared error: 5.02
Coefficient of determination (r2): 0.52
Baseline r2: -0.11620441976420581


## XGBoost

In [21]:
from xgboost.sklearn import XGBRegressor

# Fit an XGBoost regressor (verbosity=0 is passed to the constructor) on the
# vectorized training data, then predict on the held-out test matrix and on
# the random baseline matrix (rand_X).
regr = XGBRegressor(verbosity=0)
regr.fit(X_train, y_train)
y_pred = regr.predict(X_test)
rand_pred = regr.predict(rand_X)

# The metric functions are imported by name in the imports cell rather than
# reached through `skl.metrics`, which only resolves if something else has
# already loaded that submodule.
print("Mean squared error: %.2f" % mean_squared_error(y_test, y_pred))
print("Coefficient of determination (r2): %.2f" % r2_score(y_test, y_pred))
print(f"Baseline r2: {r2_score(y_test, rand_pred)}")

Mean squared error: 4.08
Coefficient of determination (r2): 0.61
Baseline r2: -3.1948970085907638
