In [1]:
import numpy

from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn import tree
from sklearn import neighbors
from sklearn import naive_bayes
from sklearn import svm
from sklearn import linear_model
from sklearn import ensemble
from sklearn import neural_network
from sklearn import feature_selection
from dataset_preperation_modules import RawFeatureToBucketize, BucketFeatureExtractor

# Preaparing Test Data for further processing

In [2]:
from sklearn.naive_bayes import MultinomialNB

In [3]:
from sklearn.feature_selection import SelectFromModel

In [4]:
test_vectorizer = CountVectorizer()

test_file_name = 'test_tweets_preprocessed.txt'

test_sentence_list = list()
test_score_list = list()

In [5]:
with open(test_file_name, 'r', encoding='utf8') as test_file:
    for line in test_file:
        splitted_line = line.split(' ')
        splitted_line.pop()
        score = float(splitted_line[-1])
        test_score_list.append(score)
        splitted_line.pop()
        sentence = ' '.join(splitted_line)
        test_sentence_list.append(sentence)

In [6]:
 #Preparing Training Data for further processing

vectorizer = CountVectorizer()

file_name = 'train_tweets_preprocessed.txt'

sentence_list = list()
score_list = list()

with open(file_name, 'r', encoding='utf8') as file:
    for line in file:
        splitted_line = line.split(' ')
        splitted_line.pop()
        score = float(splitted_line[-1])
        score_list.append(score)
        splitted_line.pop()
        sentence = ' '.join(splitted_line)
        sentence_list.append(sentence)

In [7]:
score_array = numpy.array(score_list)
test_score_array = numpy.array(test_score_list)

In [8]:
feature_array = vectorizer.fit_transform(sentence_list)
test_feature_array = vectorizer.transform(test_sentence_list)

In [9]:
print(feature_array.shape)
print(test_feature_array.shape)

(757, 1361)
(200, 1361)


In [10]:
tfidf_transformer = TfidfTransformer()

feature_array = tfidf_transformer.fit_transform(feature_array)

In [11]:
test_feature_array = tfidf_transformer.transform(test_feature_array)

In [12]:
feature_array.shape

(757, 1361)

In [13]:
test_feature_array.shape

(200, 1361)

In [14]:
# Bucketing features into 7 distinct buckets from negative to positive 1-7

features_to_bucketize = dict()

for feature_name in vectorizer.get_feature_names():
    features_to_bucketize[feature_name] = RawFeatureToBucketize(feature_name)


for sentence, score in zip(sentence_list, score_list):
    sentence = sentence.split(' ')

    for word in sentence:
        if word in features_to_bucketize:
            features_to_bucketize[word].add_score(score)

In [15]:
# Bucketing operation

word_sentiment_dictionary = dict()

for feature_name in features_to_bucketize:
    if 1.0 >= features_to_bucketize[feature_name].get_average_score() >= 0.5:
        word_sentiment_dictionary[feature_name] = 'VeryPositive'
    elif 0.5 > features_to_bucketize[feature_name].get_average_score() >= 0.0:
        word_sentiment_dictionary[feature_name] = 'Positive'
    elif 0.0 > features_to_bucketize[feature_name].get_average_score() >= -0.25:
        word_sentiment_dictionary[feature_name] = 'Neutral'
    elif -0.25 > features_to_bucketize[feature_name].get_average_score() >= -0.5:
        word_sentiment_dictionary[feature_name] = 'MinorNegative'
    elif -0.5 > features_to_bucketize[feature_name].get_average_score() >= -0.75:
        word_sentiment_dictionary[feature_name] = 'Negative'
    elif -0.75 > features_to_bucketize[feature_name].get_average_score() >= -0.90:
        word_sentiment_dictionary[feature_name] = 'VeryNegative'
    elif -0.90 > features_to_bucketize[feature_name].get_average_score() >= -1:
        word_sentiment_dictionary[feature_name] = 'ExtremelyNegative'

In [16]:
# Average score of the Training Set

score_total = 0
number_of_scores = 0

for score in score_list:
    number_of_scores += 1
    score_total += score

print(score_total / number_of_scores)

-0.3011889035667103


In [17]:
# Print Dictionary

print(word_sentiment_dictionary)

{'00': 'Neutral', '000': 'Positive', '01': 'Positive', '015': 'Positive', '018471': 'Positive', '02': 'Positive', '0212': 'Negative', '028': 'Negative', '03': 'Positive', '05': 'Positive', '050583': 'MinorNegative', '050630': 'Positive', '053277': 'Positive', '0535': 'Negative', '053841': 'Positive', '053888': 'Positive', '053979': 'Positive', '0545': 'Positive', '054561': 'Positive', '08': 'ExtremelyNegative', '09': 'ExtremelyNegative', '10': 'Neutral', '100': 'Positive', '100112': 'MinorNegative', '11': 'MinorNegative', '115': 'ExtremelyNegative', '12': 'MinorNegative', '120': 'Negative', '13': 'MinorNegative', '1309': 'VeryPositive', '14': 'Positive', '1415': 'Positive', '15': 'MinorNegative', '16': 'MinorNegative', '17': 'MinorNegative', '175': 'ExtremelyNegative', '18': 'Positive', '19': 'Neutral', '20': 'Negative', '2000': 'MinorNegative', '2004': 'Negative', '2009': 'VeryNegative', '2013': 'ExtremelyNegative', '2014': 'Positive', '212318': 'ExtremelyNegative', '212596': 'Positiv

In [18]:
for word in word_sentiment_dictionary:
    print(word + ": " + word_sentiment_dictionary[word])

00: Neutral
000: Positive
01: Positive
015: Positive
018471: Positive
02: Positive
0212: Negative
028: Negative
03: Positive
05: Positive
050583: MinorNegative
050630: Positive
053277: Positive
0535: Negative
053841: Positive
053888: Positive
053979: Positive
0545: Positive
054561: Positive
08: ExtremelyNegative
09: ExtremelyNegative
10: Neutral
100: Positive
100112: MinorNegative
11: MinorNegative
115: ExtremelyNegative
12: MinorNegative
120: Negative
13: MinorNegative
1309: VeryPositive
14: Positive
1415: Positive
15: MinorNegative
16: MinorNegative
17: MinorNegative
175: ExtremelyNegative
18: Positive
19: Neutral
20: Negative
2000: MinorNegative
2004: Negative
2009: VeryNegative
2013: ExtremelyNegative
2014: Positive
212318: ExtremelyNegative
212596: Positive
23: Positive
24: ExtremelyNegative
25: ExtremelyNegative
2986: MinorNegative
30: MinorNegative
300: ExtremelyNegative
333: Negative
334: Positive
34: MinorNegative
36: Positive
386312: MinorNegative
41: Negative
410: MinorNegat

In [19]:
training_data_bucket_feature_extractor = BucketFeatureExtractor(word_sentiment_dictionary,
                                                                features_to_bucketize,
                                                                sentence_list,
                                                                score_list)

In [20]:
training_data_extracted_features_x = training_data_bucket_feature_extractor.extract_features()
training_data_extracted_features_x = numpy.array(training_data_extracted_features_x)
print(training_data_extracted_features_x)

[[ 0.          0.          0.         ...,  0.          0.         -0.13056261]
 [ 0.          0.          0.         ...,  0.          0.         -0.37153547]
 [ 1.          0.          0.         ...,  0.          0.         -0.2648446 ]
 ..., 
 [ 0.          0.          0.         ...,  2.          2.         -0.68920274]
 [ 0.          1.          0.         ...,  0.          0.         -0.53967993]
 [ 0.          0.          0.         ...,  0.          1.         -0.41639407]]


In [21]:
# Feature Extraction for the Test Data using the Buckets and pre-processed data

test_data_bucket_feature_extractor = BucketFeatureExtractor(word_sentiment_dictionary,
                                                            features_to_bucketize,
                                                            test_sentence_list,
                                                            test_score_list)

test_data_extracted_features_x = test_data_bucket_feature_extractor.extract_features()
test_data_extracted_features_x = numpy.array(test_data_extracted_features_x)
print(test_data_extracted_features_x)

[[ 1.          0.          0.         ...,  0.          0.         -0.37644361]
 [ 0.          0.          0.         ...,  0.          0.         -0.03146927]
 [ 0.          0.          0.         ...,  0.          0.         -0.24430867]
 ..., 
 [ 0.          0.          0.         ...,  1.          0.         -0.44360345]
 [ 0.          0.          0.         ...,  0.          0.          0.06276042]
 [ 0.          0.          0.         ...,  0.          0.         -0.23870191]]


In [22]:
test_data_extracted_features_x

array([[ 1.        ,  0.        ,  0.        , ...,  0.        ,
         0.        , -0.37644361],
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        , -0.03146927],
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        , -0.24430867],
       ..., 
       [ 0.        ,  0.        ,  0.        , ...,  1.        ,
         0.        , -0.44360345],
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.06276042],
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        , -0.23870191]])

In [23]:
# Decision Tree Regression

decision_tree_regressor = tree.DecisionTreeRegressor(max_depth=7, min_samples_leaf=17)
decision_tree_regressor.fit(training_data_extracted_features_x, score_array)

# Test Decision Tree Regression on Test Data

print('Decision Tree Regression score: '
      + str(decision_tree_regressor.score(test_data_extracted_features_x, test_score_array)))
# Score: -0.759753880839
# After adjusting values, score improved, but still very bad. Score: -0.193423357515

# Get Feature Importances of Decision Tree

# print(decision_tree_regressor.feature_importances_)

# Try Regression based on k-nearest neighbors.

Decision Tree Regression score: -0.0311006708616


In [24]:
# Try Regression based on k-nearest neighbors.

k_nearest_neighbors_regressor = neighbors.KNeighborsRegressor(n_neighbors=248, p=1)
k_nearest_neighbors_regressor.fit(training_data_extracted_features_x, score_array)

# Test K-Nearest Neighbours Regression on Test Data

print('K-Nearest Neighbours Regression score: '
      + str(k_nearest_neighbors_regressor.score(test_data_extracted_features_x, test_score_array)))
# Score: -0.0961322606566
# When I changed n_neighbors from default of 5 to 248 score increased dramatically
# Changing algorithms didn't have anny effect
# Score: 0.129813702549
# When I changed p to 1 from the default value of 2, score increased again
# Score: 0.161999595131

K-Nearest Neighbours Regression score: 0.211669259898


In [25]:
# Try Support Vector Machine Regression

support_vector_machine_regressor = svm.SVR(kernel='linear', epsilon=0.2)
support_vector_machine_regressor.fit(training_data_extracted_features_x, score_array)

# Test Support Vector Machine Regression on Test Data

print('Support Vector Machine Regressor Score: '
      + str(support_vector_machine_regressor.score(test_data_extracted_features_x, test_score_array)))
# Score: -0.101547661324
# after changing the kernel to 'linear', score improved.
# Score: 0.117806253477
# after changing epsilon value to 0.2, score improved again
# Score: 0.133648477067


Support Vector Machine Regressor Score: 0.277475962127


In [26]:
# Try Stochastic Gradient Descent Regression

stochastic_gradient_descent_regressor = linear_model.SGDRegressor(tol=0.01)
stochastic_gradient_descent_regressor.fit(training_data_extracted_features_x, score_array)

# Test Stochastic Gradient Descent Regression on Test Data

print('Stochastic Gradient Descent Regressor Score: '
      + str(stochastic_gradient_descent_regressor.score(test_data_extracted_features_x, test_score_array)))
# Score: 0.176217008989

Stochastic Gradient Descent Regressor Score: 0.258230410913


In [27]:
# Try Random Forests Regression

random_forests_regressor = ensemble.RandomForestRegressor(max_depth=7, min_samples_leaf=17)
random_forests_regressor.fit(training_data_extracted_features_x, score_array)

# Test Random Forest Regression on Test Data

print('Random Forests Regressor score: ' + str(random_forests_regressor.score(test_data_extracted_features_x,
                                                                              test_score_array)))
# Score: -0.334794427196
# After changing parameters, score improved but still very bad
# Score: -0.193607366878

Random Forests Regressor score: -0.00112699961601


In [28]:
# Try Multi-layer Perceptron Regression

multi_layer_perceptron_regressor = neural_network.MLPRegressor(solver='sgd')
multi_layer_perceptron_regressor.fit(training_data_extracted_features_x, score_array)

# Test Multi-layer Perceptron on Test Data

print('Multi-layer Perceptron score: ' + str(multi_layer_perceptron_regressor.score(test_data_extracted_features_x,
                                                                                    test_score_array)))
# Score: 0.00620729465159
# It got a lot better after I changed to solver to 'sgd': stochastic gradient descent.
# Score: 0.154938217153

Multi-layer Perceptron score: 0.239079436605


In [29]:
# Try Recursive Feature Selection
estimator = svm.SVR(kernel="linear")
recursive_feature_selection = feature_selection.RFE(estimator)
recursive_feature_selection.fit(training_data_extracted_features_x, score_array)

# Test Recursive Feature Selection on Test Data

print('Recursive Feature Selection score. ' + str(recursive_feature_selection.score(test_data_extracted_features_x,
                                                                                    test_score_array)))
# Score: 0.0631286115014

Recursive Feature Selection score. 0.226363259055


In [30]:
# Try Ridge Regression

ridge_regressor = linear_model.Ridge(solver='auto', normalize=True)
ridge_regressor.fit(training_data_extracted_features_x, score_array)

# Test Rigde Regression on Test Data

print('Ridge Regressor score: ' + str(ridge_regressor.score(test_data_extracted_features_x, test_score_array)))
# Score: 0.140327821678
# It slightly improved when I changed the solver to saga (Stochastic Average Gradient Descent)
# When I turned on normalize ,after changing the solver to saga, score improved; however before that, it didn't help.
# Score: 0.191242391892

Ridge Regressor score: 0.296883000765


In [31]:
# Todo: Merge features from the Excel file with the generated features
# Todo: Run all the Regressors and Classifiers with the dataset containing combined features
# Todo: Try Naive Bayes Classifiers from the Scikit Learn
# Todo: (Gaussian Naive Bayes, Multinomial Naive Bayes, Bernoulli Naive Bayes)

In [32]:
import pandas as pd

In [33]:
trainExcelFileName = 'train21-Onlyfeatures.xlsx'

In [34]:
trainExcel = pd.ExcelFile(trainExcelFileName)

In [35]:
print(trainExcel.sheet_names)

['Bank_Train']


In [36]:
Traindf = trainExcel.parse('Bank_Train')

In [37]:
Traindf.columns

Index(['F1', 'F2', 'F3', 'F4', 'F5', 'F6', 'F7', 'F8', 'F9', 'F10', 'F11',
       'F12', 'F13', 'F14', 'F15', 'F16', 'F17', 'F18', 'F19', 'F20', 'F21'],
      dtype='object')

In [38]:
Traindf.astype(float)

Unnamed: 0,F1,F2,F3,F4,F5,F6,F7,F8,F9,F10,...,F12,F13,F14,F15,F16,F17,F18,F19,F20,F21
0,0.104344,0.0,0.0,0.0,0.0,0.0,0.0,6.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,5.0,7.0,0.424286,0.880000
1,-0.199180,0.0,0.0,0.0,0.0,2.0,0.0,7.0,4.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,13.0,13.0,0.421538,0.897692
2,-0.237453,0.0,0.0,0.0,0.0,1.0,0.0,7.0,2.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,6.0,9.0,0.411111,0.833333
3,-0.201703,0.0,0.0,0.0,0.0,4.0,0.0,8.0,1.0,0.0,...,0.0,1.0,0.0,0.0,0.0,1.0,10.0,14.0,0.375714,0.845000
4,-0.227058,0.0,0.0,0.0,0.0,0.0,0.0,5.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,7.0,5.0,0.390000,0.834286
5,-0.253008,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,5.0,8.0,0.422500,0.734000
6,0.000000,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,1.0,3.0,0.436667,0.920000
7,-0.231702,0.0,0.0,0.0,0.0,1.0,0.0,3.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,4.0,7.0,0.364286,0.907500
8,-0.126504,0.0,0.0,1.0,1.0,0.0,0.0,3.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,1.0,3.0,5.0,0.432000,0.926667
9,-0.136235,0.0,0.0,1.0,1.0,1.0,0.0,8.0,4.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,13.0,10.0,0.448000,0.835385


In [39]:
trainExcelNPArray = Traindf.as_matrix()

In [40]:
trainAllFeatures=numpy.hstack((trainExcelNPArray,training_data_extracted_features_x))

In [41]:
trainAllFeatures.shape

(757, 33)

In [42]:
testExcelFileName = 'test-21-Onlyfeatures.xlsx'

In [43]:
testExcel = pd.ExcelFile(testExcelFileName)

In [44]:
print(testExcel.sheet_names)

['Bank_Test']


In [45]:
Testdf = testExcel.parse('Bank_Test')

In [46]:
Testdf.astype(float)

Unnamed: 0,F1,F2,F3,F4,F5,F6,F7,F8,F9,F10,...,F12,F13,F14,F15,F16,F17,F18,F19,F20,F21
0,-0.105420,0.0,0.0,0.0,0.0,0.0,0.0,4.0,3.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,5.0,7.0,0.428571,7.860000e-01
1,-0.089180,0.0,0.0,0.0,0.0,2.0,0.0,6.0,2.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,7.0,9.0,0.418889,8.671429e-01
2,-0.107677,0.0,0.0,0.0,0.0,0.0,0.0,4.0,3.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,4.0,8.0,0.380000,8.825000e-01
3,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,1.0,3.0,0.436667,9.200000e-01
4,-0.084884,0.0,0.0,0.0,0.0,0.0,0.0,7.0,2.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,6.0,9.0,0.428889,8.350000e-01
5,-0.148431,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,1.0,5.0,0.330000,9.200000e-01
6,-0.075902,0.0,0.0,0.0,0.0,1.0,0.0,0.0,2.0,0.0,...,2.0,0.0,0.0,0.0,0.0,1.0,6.0,4.0,0.425000,8.883333e-01
7,-0.093208,0.0,0.0,0.0,0.0,1.0,0.0,8.0,6.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,8.0,14.0,0.370000,8.687500e-01
8,-0.119604,0.0,0.0,0.0,0.0,0.0,0.0,10.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,1.0,9.0,7.0,0.474286,8.833333e-01
9,-0.001278,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,4.0,5.0,0.476000,8.875000e-01


In [47]:
testExcelNPArray = Testdf.as_matrix()

In [48]:
testExcelNPArray.shape

(200, 21)

In [49]:
testFeaturesPruned = test_data_extracted_features_x[0:200]

In [50]:
testFeaturesPruned.shape

(200, 12)

In [51]:
testScorePruned = test_score_array[0:200]

In [52]:
testAllFeatures=numpy.hstack((testExcelNPArray,testFeaturesPruned))

In [53]:
testAllFeatures.shape

(200, 33)

In [54]:
# Decision Tree Regression

decision_tree_regressor = tree.DecisionTreeRegressor(max_depth=7, min_samples_leaf=17)
decision_tree_regressor.fit(trainAllFeatures, score_array)

# Test Decision Tree Regression on Test Data

print('Decision Tree Regression score: '
      + str(decision_tree_regressor.score(testAllFeatures, testScorePruned)))
# Score: -0.759753880839
# After adjusting values, score improved, but still very bad. Score: -0.193423357515

# Get Feature Importances of Decision Tree

# print(decision_tree_regressor.feature_importances_)

# Try Regression based on k-nearest neighbors.

Decision Tree Regression score: -0.059591080655


In [55]:
#Try Regression based on k-nearest neighbors.

k_nearest_neighbors_regressor = neighbors.KNeighborsRegressor(n_neighbors=75, p=1)
k_nearest_neighbors_regressor.fit(trainAllFeatures, score_array)

# Test K-Nearest Neighbours Regression on Test Data

print('K-Nearest Neighbours Regression score: '
      + str(k_nearest_neighbors_regressor.score(testAllFeatures, testScorePruned)))
# Score: -0.0961322606566
# When I changed n_neighbors from default of 5 to 248 score increased dramatically Yet changing n_neighbors to 75 increased the score
# Changing algorithms didn't have anny effect
# Score: 0.129813702549
# When I changed p to 1 from the default value of 2, score increased again
# Score: 0.161999595131

K-Nearest Neighbours Regression score: 0.201266713057


In [56]:
# # Try Support Vector Machine Regression

# support_vector_machine_regressor = svm.LinearSVR(kernel='linear', epsilon=0.2,cache_size=7000)
# support_vector_machine_regressor.fit(trainAllFeatures, score_array)

# # Test Support Vector Machine Regression on Test Data

# print('Support Vector Machine Regressor Score: '
#       + str(support_vector_machine_regressor.score(testAllFeatures, testScorePruned)))
# # Score: -0.101547661324
# # after changing the kernel to 'linear', score improved.
# # Score: 0.117806253477
# # after changing epsilon value to 0.2, score improved again
# # Score: 0.133648477067

In [57]:
# Try Stochastic Gradient Descent Regression

stochastic_gradient_descent_regressor = linear_model.SGDRegressor(tol=1e-3)
stochastic_gradient_descent_regressor.fit(trainAllFeatures, score_array)

# Test Stochastic Gradient Descent Regression on Test Data

print('Stochastic Gradient Descent Regressor Score: '
      + str(stochastic_gradient_descent_regressor.score(testAllFeatures, testScorePruned)))
# Score: 0.176217008989

Stochastic Gradient Descent Regressor Score: -1.44070301674e+68


In [58]:
# Try Random Forests Regression

random_forests_regressor = ensemble.RandomForestRegressor(max_depth=7, min_samples_leaf=17)
random_forests_regressor.fit(trainAllFeatures, score_array)

# Test Random Forest Regression on Test Data

print('Random Forests Regressor score: ' + str(random_forests_regressor.score(testAllFeatures,
                                                                              testScorePruned)))

Random Forests Regressor score: 0.00384244734706


In [59]:
# Try Multi-layer Perceptron Regression

multi_layer_perceptron_regressor = neural_network.MLPRegressor(solver='sgd')
multi_layer_perceptron_regressor.fit(trainAllFeatures, score_array)

# Test Multi-layer Perceptron on Test Data

print('Multi-layer Perceptron score: ' + str(multi_layer_perceptron_regressor.score(testAllFeatures,
                                                                                    testScorePruned)))
# Score: 0.00620729465159
# It got a lot better after I changed to solver to 'sgd': stochastic gradient descent.
# Score: 0.154938217153

  return ((y_true - y_pred) ** 2).mean() / 2


ValueError: Input contains NaN, infinity or a value too large for dtype('float64').

In [None]:
# # Try Recursive Feature Selection
# estimator = svm.SVR(kernel="linear")
# recursive_feature_selection = feature_selection.RFE(estimator)
# recursive_feature_selection.fit(trainAllFeatures, score_array)

# # Test Recursive Feature Selection on Test Data

# print('Recursive Feature Selection score. ' + str(recursive_feature_selection.score(testAllFeatures,
#                                                                                     testScorePruned)))
# # Score: 0.0631286115014

In [108]:
# Try Ridge Regression

ridge_regressor = linear_model.Ridge(solver='sag', normalize=True)
ridge_regressor.fit(trainAllFeatures, score_array)

# Test Rigde Regression on Test Data

print('Ridge Regressor score: ' + str(ridge_regressor.score(testAllFeatures, testScorePruned)))
# Score: 0.140327821678
# It slightly improved when I changed the solver to saga (Stochastic Average Gradient Descent)
# When I turned on normalize ,after changing the solver to saga, score improved; however before that, it didn't help.
# Score: 0.191242391892
predictedListOne = ridge_regressor.predict(testAllFeatures)

Ridge Regressor score: 0.299447255872


In [61]:
# Try Bayesian Ridge Regression

ridge_regressor = linear_model.BayesianRidge(normalize=True)
ridge_regressor.fit(trainAllFeatures, score_array)

# Test Rigde Regression on Test Data

print('Ridge Regressor score: ' + str(ridge_regressor.score(testAllFeatures, testScorePruned)))

Ridge Regressor score: 0.247688599071


In [62]:
# Try ensemble   Regression

ensemble_regressor = ensemble.AdaBoostRegressor()
ensemble_regressor.fit(trainAllFeatures, score_array)

# Test ensemble Regression on Test Data

print('ensemble Regressor score: ' + str(ensemble_regressor.score(testAllFeatures, testScorePruned)))

ensemble Regressor score: 0.00884248453279


In [63]:
# Try ensemble   Regression

ensemble_regressor = ensemble.BaggingRegressor()
ensemble_regressor.fit(trainAllFeatures, score_array)

# Test ensemble Regression on Test Data

print('ensemble Regressor score: ' + str(ensemble_regressor.score(testAllFeatures, testScorePruned)))

ensemble Regressor score: -0.134421059862


In [64]:
# Try ensemble   Regression

ensemble_regressor = ensemble.ExtraTreesRegressor()
ensemble_regressor.fit(trainAllFeatures, score_array)

# Test ensemble Regression on Test Data

print('ensemble Regressor score: ' + str(ensemble_regressor.score(testAllFeatures, testScorePruned)))

ensemble Regressor score: -0.121393510604


In [65]:
# Try ensemble   Regression

ensemble_regressor = ensemble.GradientBoostingRegressor()
ensemble_regressor.fit(trainAllFeatures, score_array)

# Test ensemble Regression on Test Data

print('ensemble Regressor score: ' + str(ensemble_regressor.score(testAllFeatures, testScorePruned)))

ensemble Regressor score: -0.102476098383


In [66]:
# Try ensemble   Regression

ensemble_regressor = ensemble.RandomForestRegressor()
ensemble_regressor.fit(trainAllFeatures, score_array)

# Test ensemble Regression on Test Data

print('ensemble Regressor score: ' + str(ensemble_regressor.score(testAllFeatures, testScorePruned)))

ensemble Regressor score: -0.126840546635


In [67]:
###########************REGRESSION WITH 21 FEATURES STARTS HERE ********************************#################

In [68]:
# testExcelNPArray.shape      testScorePruned

# trainExcelNPArray    score_array

In [69]:
# Decision Tree Regression

decision_tree_regressor = tree.DecisionTreeRegressor(max_depth=7, min_samples_leaf=17)
decision_tree_regressor.fit(trainExcelNPArray, score_array)

# Test Decision Tree Regression on Test Data

print('Decision Tree Regression score: '
      + str(decision_tree_regressor.score(testExcelNPArray, testScorePruned)))
# Score: -0.759753880839
# After adjusting values, score improved, but still very bad. Score: -0.193423357515 -> with only extracted features

# with only given features it is better but still the score values is negative

# Get Feature Importances of Decision Tree

# print(decision_tree_regressor.feature_importances_)

# Try Regression based on k-nearest neighbors.

Decision Tree Regression score: -0.149098427286


In [70]:
# Try Regression based on k-nearest neighbors.

k_nearest_neighbors_regressor = neighbors.KNeighborsRegressor(n_neighbors=248, p=1)
k_nearest_neighbors_regressor.fit(trainExcelNPArray, score_array)

# Test K-Nearest Neighbours Regression on Test Data

print('K-Nearest Neighbours Regression score: '
      + str(k_nearest_neighbors_regressor.score(testExcelNPArray, testScorePruned)))
# Score: -0.0961322606566
# When I changed n_neighbors from default of 5 to 248 score increased dramatically
# Changing algorithms didn't have anny effect
# Score: 0.129813702549
# When I changed p to 1 from the default value of 2, score increased again
# Score: 0.161999595131

# with only given 21 features the score is slighly worse 

K-Nearest Neighbours Regression score: 0.110167788494


In [71]:
# # Try Support Vector Machine Regression

# support_vector_machine_regressor = svm.SVR(kernel='linear', epsilon=0.2)
# support_vector_machine_regressor.fit(trainExcelNPArray, score_array)

# # Test Support Vector Machine Regression on Test Data

# print('Support Vector Machine Regressor Score: '
#       + str(support_vector_machine_regressor.score(testExcelNPArray, test_score_array)))
# # Score: -0.101547661324
# # after changing the kernel to 'linear', score improved.
# # Score: 0.117806253477
# # after changing epsilon value to 0.2, score improved again
# # Score: 0.133648477067

In [72]:
# Try Stochastic Gradient Descent Regression

stochastic_gradient_descent_regressor = linear_model.SGDRegressor(tol=1e-3)
stochastic_gradient_descent_regressor.fit(trainExcelNPArray, score_array)

# Test Stochastic Gradient Descent Regression on Test Data

print('Stochastic Gradient Descent Regressor Score: '
      + str(stochastic_gradient_descent_regressor.score(testExcelNPArray, testScorePruned)))
# Score: 0.176217008989

Stochastic Gradient Descent Regressor Score: -7.49631133379e+68


In [73]:
# Try Random Forests Regression

random_forests_regressor = ensemble.RandomForestRegressor(max_depth=7, min_samples_leaf=17)
random_forests_regressor.fit(trainExcelNPArray, score_array)

# Test Random Forest Regression on Test Data

print('Random Forests Regressor score: ' + str(random_forests_regressor.score(testExcelNPArray,
                                                                              testScorePruned)))
# Score: -0.334794427196
# After changing parameters, score improved but still very bad
# Score: -0.193607366878

Random Forests Regressor score: -0.00608422740509


In [74]:
# Try Multi-layer Perceptron Regression

multi_layer_perceptron_regressor = neural_network.MLPRegressor(solver='sgd')
multi_layer_perceptron_regressor.fit(trainExcelNPArray, score_array)

# Test Multi-layer Perceptron on Test Data

print('Multi-layer Perceptron score: ' + str(multi_layer_perceptron_regressor.score(testExcelNPArray,
                                                                                    testScorePruned)))
# Score: 0.00620729465159
# It got a lot better after I changed to solver to 'sgd': stochastic gradient descent.
# Score: 0.154938217153

Multi-layer Perceptron score: -2.28490127452e+171


In [75]:
# Try Ridge Regression

ridge_regressor = linear_model.Ridge(solver='auto', normalize=True)
ridge_regressor.fit(trainExcelNPArray, score_array)

# Test Rigde Regression on Test Data

print('Ridge Regressor score: ' + str(ridge_regressor.score(testExcelNPArray, testScorePruned)))
# Score: 0.140327821678
# It slightly improved when I changed the solver to saga (Stochastic Average Gradient Descent)
# When I turned on normalize ,after changing the solver to saga, score improved; however before that, it didn't help.
# Score: 0.191242391892

Ridge Regressor score: 0.146191396497


In [76]:
# Try Bayesian Ridge Regression

ridge_regressor = linear_model.BayesianRidge(normalize=True)
ridge_regressor.fit(trainExcelNPArray, score_array)

# Test Rigde Regression on Test Data

print('Ridge Regressor score: ' + str(ridge_regressor.score(testExcelNPArray, testScorePruned)))

Ridge Regressor score: 0.0310530631153


In [77]:
####################***********FEATURE SElECTION FROM BAG OF WORDS MODEL ***************###########################

In [78]:
# feature_array.shape  // score_array
# test_feature_array -> take the first 200 samples // testScorePruned
# model = SelectFromModel(reg, prefit=True, threshold=0.05)

In [79]:
test_feature_array = test_feature_array[0:200]

In [80]:
ridge_regressor = linear_model.Ridge(solver='auto', normalize=True)
ridge_regressor.fit(feature_array, score_array)

# Test Rigde Regression on Test Data

print('Ridge Regressor score: ' + str(ridge_regressor.score(test_feature_array, testScorePruned)))

Ridge Regressor score: 0.216133793898


In [81]:
model = SelectFromModel(ridge_regressor, prefit=True, threshold="2*mean")

In [82]:
selectedFeaturesFromBagOfWords = model.transform(feature_array)

In [83]:
selectedTestFeaturesFromBagOfWords = model.transform(test_feature_array)

In [84]:
selectedFeaturesFromBagOfWords.shape

(757, 146)

In [85]:
selectedTestFeaturesFromBagOfWords.shape

(200, 146)

In [86]:
ridge_regressor = linear_model.Ridge(solver='auto', normalize=True)
ridge_regressor.fit(selectedFeaturesFromBagOfWords, score_array)

# Test Rigde Regression on Test Data

print('Ridge Regressor score with selected features from bag of words: ' + str(ridge_regressor.score(selectedTestFeaturesFromBagOfWords, testScorePruned)))

Ridge Regressor score with selected features from bag of words: 0.101547209722


In [87]:
##############************* LET'S COMBINE ALL THE FEATURES *********************######################

In [88]:
selectedFeaturesFromBagOfWordsArray =selectedFeaturesFromBagOfWords.toarray()

In [89]:
selectedTestFeaturesFromBagOfWordsArray = selectedTestFeaturesFromBagOfWords.toarray()

In [90]:
trainAllFeaturesCombined=numpy.hstack((trainAllFeatures,selectedFeaturesFromBagOfWordsArray))

In [91]:
testAllFeaturesCombined=numpy.hstack((testAllFeatures,selectedTestFeaturesFromBagOfWordsArray))

In [92]:
trainAllFeaturesCombined.shape

(757, 179)

In [93]:
testAllFeaturesCombined.shape

(200, 179)

In [94]:
#########################*********** REGRESSION USING ALL THE FEATURES COMBINED *****************#################33

In [95]:
# Decision Tree Regression

decision_tree_regressor = tree.DecisionTreeRegressor(max_depth=7, min_samples_leaf=17)
decision_tree_regressor.fit(trainAllFeaturesCombined, score_array)

# Test Decision Tree Regression on Test Data

print('Decision Tree Regression score: '
      + str(decision_tree_regressor.score(testAllFeaturesCombined, testScorePruned)))

Decision Tree Regression score: -0.107075691865


In [96]:
# Try Regression based on k-nearest neighbors.

k_nearest_neighbors_regressor = neighbors.KNeighborsRegressor(n_neighbors=65, p=1)
k_nearest_neighbors_regressor.fit(trainAllFeaturesCombined, score_array)

# Test K-Nearest Neighbours Regression on Test Data

print('K-Nearest Neighbours Regression score: '
      + str(k_nearest_neighbors_regressor.score(testAllFeaturesCombined, testScorePruned)))

K-Nearest Neighbours Regression score: 0.213326574512


In [97]:
# Try Stochastic Gradient Descent Regression

stochastic_gradient_descent_regressor = linear_model.SGDRegressor(tol=1e-3)
stochastic_gradient_descent_regressor.fit(trainAllFeaturesCombined, score_array)

# Test Stochastic Gradient Descent Regression on Test Data

print('Stochastic Gradient Descent Regressor Score: '
      + str(stochastic_gradient_descent_regressor.score(testAllFeaturesCombined, testScorePruned)))

Stochastic Gradient Descent Regressor Score: -1.27924009544e+69


In [98]:
# Try Random Forests Regression

random_forests_regressor = ensemble.RandomForestRegressor(max_depth=10, min_samples_leaf=8)
random_forests_regressor.fit(trainAllFeaturesCombined, score_array)

# Test Random Forest Regression on Test Data

print('Random Forests Regressor score: ' + str(random_forests_regressor.score(testAllFeaturesCombined,
                                                                              testScorePruned)))

Random Forests Regressor score: 0.0225786670209


In [99]:
# Try Multi-layer Perceptron Regression

multi_layer_perceptron_regressor = neural_network.MLPRegressor(solver='sgd',hidden_layer_sizes=(100000,1))
multi_layer_perceptron_regressor.fit(trainAllFeaturesCombined, score_array)

# Test Multi-layer Perceptron on Test Data

print('Multi-layer Perceptron score: ' + str(multi_layer_perceptron_regressor.score(testAllFeaturesCombined,
                                                                                    testScorePruned)))

Multi-layer Perceptron score: -0.0118450294104


In [107]:
# Try Ridge Regression

ridge_regressor = linear_model.Ridge(solver='auto', normalize=True)
ridge_regressor.fit(trainAllFeaturesCombined, score_array)

# Test Rigde Regression on Test Data

print('Ridge Regressor score: ' + str(ridge_regressor.score(testAllFeaturesCombined, testScorePruned)))
print('Ridge Regressor prediction: ' + str(ridge_regressor.predict(testAllFeaturesCombined)))
predictedListTwo = ridge_regressor.predict(testAllFeaturesCombined)

Ridge Regressor score: 0.231409345332
Ridge Regressor prediction: [ -2.42420719e-01   3.52242512e-01   1.06678714e-02   2.56871959e-02
  -3.03343202e-01  -3.31628729e-01  -4.60773307e-01  -1.74970400e-01
   1.19400671e-01   2.75761079e-01  -1.68669255e-01  -6.48985401e-02
  -1.63135767e-01  -1.27081305e-01  -5.63680686e-01  -1.33573550e-01
  -4.40413132e-01   2.72971768e-02   2.60984523e-03  -2.16539267e-01
  -8.22508793e-01  -1.77730376e-01  -7.03493262e-01  -6.73511321e-01
  -1.47950421e-02  -8.40664909e-01  -2.70622648e-01  -2.05993086e-01
  -4.50204024e-01  -5.21952630e-01  -5.94716148e-01   5.46971545e-01
  -5.94041804e-01  -3.30822695e-01  -6.42548079e-01  -3.38678722e-01
  -3.75990306e-01  -3.72729650e-01  -3.64115922e-01  -6.85511528e-01
  -4.63777186e-02  -5.33515199e-01  -4.00683967e-01   2.02413369e-02
  -1.68665454e-01  -3.25054460e-01  -8.80108584e-02  -3.66439378e-01
  -3.92417207e-01  -3.79797239e-01  -5.11720615e-01  -5.75414863e-01
   2.02702199e-01  -3.50832766e-01   

In [101]:
# Try Bayesian Ridge Regression

ridge_regressor = linear_model.BayesianRidge(normalize=True)
ridge_regressor.fit(trainAllFeaturesCombined, score_array)

# Test Rigde Regression on Test Data

print('Ridge Regressor score: ' + str(ridge_regressor.score(testAllFeaturesCombined, testScorePruned)))
print('Ridge Regressor prediction: ' + str(ridge_regressor.predict(testAllFeaturesCombined)))

Ridge Regressor score: 0.11981181108
Ridge Regressor prediction: [ -1.81221438e-01   4.42262150e-01   1.35298998e-01   7.78244404e-02
  -3.01346970e-01  -3.39339573e-01  -6.41043423e-01  -1.16718757e-01
   3.71180505e-01   3.55132877e-01  -2.17012966e-01  -5.13529418e-02
  -1.50746344e-01  -1.74567138e-01  -6.21766170e-01  -7.39351696e-02
  -4.18772204e-01   4.89991377e-02   4.54013283e-02  -9.20327326e-02
  -1.07037159e+00  -1.44307536e-01  -7.53807453e-01  -7.11306734e-01
  -1.31138375e-02  -8.90923518e-01  -2.43645178e-01  -1.95878910e-01
  -4.04486410e-01  -6.23035607e-01  -6.32087946e-01   7.09232844e-01
  -6.63171120e-01  -3.33530036e-01  -7.27456214e-01  -3.63878178e-01
  -4.70257945e-01  -3.36976419e-01  -3.10720541e-01  -6.99478586e-01
  -1.67415348e-02  -6.42114298e-01  -4.19971713e-01   9.40666885e-02
  -1.64970900e-01  -2.59362291e-01  -9.75768836e-02  -3.72445475e-01
  -4.39769287e-01  -3.37344995e-01  -5.70173682e-01  -6.45513314e-01
   2.39570328e-01  -3.87401102e-01   3

In [102]:
# Try ensemble   Regression

ensemble_regressor = ensemble.AdaBoostRegressor(base_estimator=ridge_regressor,loss='exponential',learning_rate=0.1)
ensemble_regressor.fit(trainAllFeaturesCombined, score_array)

# Test ensemble Regression on Test Data

print('ensemble Regressor score: ' + str(ensemble_regressor.score(testAllFeaturesCombined, testScorePruned)))

ensemble Regressor score: 0.0829749475567


In [103]:
# Try ensemble   Regression

ensemble_regressor = ensemble.BaggingRegressor(n_estimators=10, random_state=0)
ensemble_regressor.fit(trainAllFeaturesCombined, score_array)

# Test ensemble Regression on Test Data

print('ensemble Regressor score: ' + str(ensemble_regressor.score(testAllFeaturesCombined, testScorePruned)))

ensemble Regressor score: -0.212489547832


In [109]:
fileForReport= open("fileForReport.txt","w+")

In [110]:
for i in range(len(predictedListOne)):
     fileForReport.write(str(predictedListOne[i])+"\n")

In [111]:
fileForReport.close() 

In [112]:
fileForReportTwo= open("fileForReportTwo.txt","w+")

In [113]:
for i in range(len(predictedListTwo)):
     fileForReportTwo.write(str(predictedListTwo[i])+"\n")

In [114]:
fileForReportTwo.close() 