In [1]:
import pandas as pd
import numpy as np
import nltk
import data_reader
import results_analyser

In [2]:
# Directory holding the lexicon CSV files. The file names below are appended
# to it by plain string concatenation, so the trailing slash is required.
LEXICON_DIR = "./custom_lexicons/stocktwitlexi/"

# Lexicon variants to evaluate, in the order they appear in the results table.
# NOTE(review): "norm" / "std" / "raw" presumably denote normalised,
# standardised and unscaled scores — confirm against the lexicon builder.
LEXICON_PATHS = [
    "domain_lexicon_norm.csv",
    "domain_lexicon_std.csv",
    "domain_lexicon_raw.csv",
    "domain_lexicon_raw_norm.csv",
    "domain_lexicon_raw_std.csv",
]

In [3]:
def stocktwitlexi_processor(X, stocktwitlexi):
    """Score each tokenised tweet by summing the lexicon values of its tokens.

    Parameters
    ----------
    X : iterable of iterables of str
        One entry per tweet; each entry is iterated token by token.
    stocktwitlexi : mapping
        Maps a token to its numeric score. Must provide ``.get`` (e.g. a dict).

    Returns
    -------
    list
        One raw score per tweet, in input order. Tokens missing from the
        lexicon contribute 0, so an empty or fully unknown tweet scores 0.

    Raises
    ------
    TypeError
        If a lexicon value cannot be added to the running score (for example
        a string read from a stray CSV header row). The previous bare
        ``except`` hid this and silently scored such tokens as 0.
    """
    stocktwitlexi_pred_raw = []

    for test_tweet in X:
        # Accumulate with a plain running sum (not the built-in sum()) so the
        # floating-point result is the same left-to-right addition as before.
        stocktwitlexi_score = 0
        # Tokens are looked up directly: POS tagging was computed here before
        # but the tags were never read, only the unchanged token text.
        for word in test_tweet:
            # Only "token not in lexicon" is treated as a neutral 0; any other
            # failure is allowed to surface instead of being swallowed.
            stocktwitlexi_score += stocktwitlexi.get(word, 0)
        stocktwitlexi_pred_raw.append(stocktwitlexi_score)

    return stocktwitlexi_pred_raw

def generate_results(X, y_class):
    """Evaluate every lexicon in ``LEXICON_PATHS`` on one dataset.

    For each lexicon file the function reads the CSV (no header row, first
    column used as the index), takes column ``1`` as a token -> score dict,
    scores ``X`` with ``stocktwitlexi_processor``, converts the raw scores
    with ``results_analyser.probability_to_class`` and hands the result to
    ``results_analyser.calculate_metrics``.

    Parameters
    ----------
    X
        Tokenised tweets, passed unchanged to ``stocktwitlexi_processor``.
    y_class
        Reference labels, passed unchanged to ``calculate_metrics``.

    Returns
    -------
    The object returned by the last ``calculate_metrics`` call; an empty
    ``pd.DataFrame`` if ``LEXICON_PATHS`` is empty.
    NOTE(review): presumably one metrics row per lexicon file — confirm in
    ``results_analyser``.
    """
    metrics_table = pd.DataFrame()

    for lexicon_file in LEXICON_PATHS:
        lexicon_frame = pd.read_csv(
            LEXICON_DIR + lexicon_file, header=None, index_col=0
        )
        # With header=None and the first column as index, the score column
        # carries the integer label 1.
        word_scores = lexicon_frame.to_dict()[1]

        raw_scores = stocktwitlexi_processor(X, word_scores)
        predicted_labels = results_analyser.probability_to_class(raw_scores)

        # The lexicon file name doubles as the experiment label.
        metrics_table = results_analyser.calculate_metrics(
            metrics_table, y_class, predicted_labels, lexicon_file
        )

    return metrics_table

## Data 1

In [11]:
# Evaluate every lexicon variant on dataset 1 alone.
# NOTE(review): "list" presumably asks data_reader for plain Python lists — confirm.
data1_X, data1_y_class = data_reader.read_data1("list")
generate_results(X=data1_X, y_class=data1_y_class)

Unnamed: 0,Accuracy,Experiment,F1_score,Precision,Recall
0,0.691547,domain_lexicon_norm.csv,0.804143,0.6832,0.977117
1,0.575877,domain_lexicon_std.csv,0.569277,0.832599,0.432494
2,0.641127,domain_lexicon_raw.csv,0.688412,0.787046,0.611747
3,0.695502,domain_lexicon_raw_norm.csv,0.805923,0.686527,0.975591
4,0.644093,domain_lexicon_raw_std.csv,0.694656,0.782235,0.624714


## Data 2

In [12]:
# Evaluate every lexicon variant on dataset 2 alone.
# NOTE(review): "list" presumably asks data_reader for plain Python lists — confirm.
data2_X, data2_y_class = data_reader.read_data2("list")
generate_results(X=data2_X, y_class=data2_y_class)

Unnamed: 0,Accuracy,Experiment,F1_score,Precision,Recall
0,0.613714,domain_lexicon_norm.csv,0.752199,0.612903,0.973435
1,0.533714,domain_lexicon_std.csv,0.434903,0.805128,0.297913
2,0.626286,domain_lexicon_raw.csv,0.644178,0.755102,0.56167
3,0.620571,domain_lexicon_raw_norm.csv,0.755882,0.617047,0.975332
4,0.633143,domain_lexicon_raw_std.csv,0.655209,0.75495,0.578748


## Data 3

In [13]:
# Evaluate every lexicon variant on dataset 3 alone.
# NOTE(review): "list" presumably asks data_reader for plain Python lists — confirm.
data3_X, data3_y_class = data_reader.read_data3("list")
generate_results(X=data3_X, y_class=data3_y_class)

Unnamed: 0,Accuracy,Experiment,F1_score,Precision,Recall
0,0.641688,domain_lexicon_norm.csv,0.75814,0.62572,0.961652
1,0.69509,domain_lexicon_std.csv,0.687831,0.855263,0.575221
2,0.740741,domain_lexicon_raw.csv,0.765027,0.812604,0.722714
3,0.638243,domain_lexicon_raw_norm.csv,0.75553,0.624038,0.957227
4,0.744186,domain_lexicon_raw_std.csv,0.770302,0.809756,0.734513


## Combined

In [15]:
# Pool the three datasets into NEW lists. Assigning `combined_X = data1_X` and
# then calling .extend() would only create an alias: data1_X / data1_y_class
# themselves would grow to hold all three datasets, and re-running this cell
# would append datasets 2 and 3 again.
combined_X = [*data1_X, *data2_X, *data3_X]
combined_y_class = [*data1_y_class, *data2_y_class, *data3_y_class]

# Sanity check: the two printed lengths should match (one label per tweet).
print(len(combined_X), len(combined_y_class))

4059 4059


In [16]:
# Evaluate every lexicon variant on the pooled dataset.
generate_results(X=combined_X, y_class=combined_y_class)

Unnamed: 0,Accuracy,Experiment,F1_score,Precision,Recall
0,0.660508,domain_lexicon_norm.csv,0.780223,0.651572,0.972178
1,0.600887,domain_lexicon_std.csv,0.579002,0.836336,0.442766
2,0.66642,domain_lexicon_raw.csv,0.701104,0.788481,0.631161
3,0.662971,domain_lexicon_raw_norm.csv,0.78119,0.65364,0.970588
4,0.670362,domain_lexicon_raw_std.csv,0.707988,0.785092,0.644674
