In [2]:
import json
import matplotlib as mpl
import numpy as np
import sklearn as skl

## Read and preprocess data from json

In [3]:
sent_f = open("data/sentiment_data.json")
twitter_f = open("data/tweets.json")
google_dict = json.load(sent_f)
twitter_dict = json.load(twitter_f)

In [4]:
Xs = np.array(list(twitter_dict.values()))
Ys = []
for id in google_dict.keys():
    Ys.append(google_dict[id]["Score"])
Ys = np.array(Ys)
Ys = (Ys*10).astype(int)

#### Create train test split

In [5]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(Xs, Ys, test_size=0.15)

#### Vectorize strings so they can go into models

In [6]:
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer()
vectorizer.fit(X_train)

X_train = vectorizer.transform(X_train)
X_test  = vectorizer.transform(X_test)

#### Create baseline data set

In [7]:
rand_X = np.random.randint(low=-10, high=10, size=np.shape(X_test))

## Random Forest

In [8]:
from sklearn.ensemble import RandomForestRegressor

regr = RandomForestRegressor(n_estimators = 100, random_state = 0)
regr.fit(X_train, y_train)
y_pred = regr.predict(X_test)
rand_pred = regr.predict(rand_X)

print("Mean squared error: %.2f" % skl.metrics.mean_squared_error(y_test, y_pred))
print("Coefficient of determination (r2): %.2f" % skl.metrics.r2_score(y_test, y_pred))
print(f"Baseline r2: {skl.metrics.r2_score(y_test,rand_pred)}")

Mean squared error: 4.26
Coefficient of determination (r2): 0.68
Baseline r2: -0.3751771094460481


#### Lets try some grid search

In [11]:
from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.preprocessing import MinMaxScaler

gsc = GridSearchCV(
        estimator=RandomForestRegressor(),
        param_grid={
            'max_depth': range(3,7),
            'n_estimators': (10, 50, 100, 1000),
        },
        cv=5, scoring='neg_mean_squared_error', verbose=0, n_jobs=-1)
    
grid_result = gsc.fit(X_train, y_train)
best_params = grid_result.best_params_
regr = RandomForestRegressor(max_depth=best_params["max_depth"], n_estimators=best_params["n_estimators"], random_state=False, verbose=False)
regr.fit(X_train, y_train)
y_pred = regr.predict(X_test)
rand_pred = regr.predict(rand_X)

print("Mean squared error: %.2f" % skl.metrics.mean_squared_error(y_test, y_pred))
print("Coefficient of determination (r2): %.2f" % skl.metrics.r2_score(y_test, y_pred))
print(f"Baseline r2: {skl.metrics.r2_score(y_test,rand_pred)}")

Mean squared error: 6.42
Coefficient of determination (r2): 0.52
Baseline r2: -0.4058444825098173
