In [None]:
# Data loading

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

train_path = '../input/train.tsv'
test_path = '../input/test.tsv'

# Both files are tab-separated, hence sep="\t".
train_data = pd.read_csv(train_path, sep="\t")
test_data = pd.read_csv(test_path, sep="\t")

# Keep the preview as the cell's final expression: Jupyter only renders the
# value of the last expression, so a bare .head() in the middle of a cell
# displays nothing.
train_data.head()

In [None]:
# Vectorize reviews by building a vocabulary, and by encoding each review as a
# TF-IDF weighted vector over that vocabulary (this is not a binary 'one-hot'
# encoding: TfidfVectorizer produces term-frequency / inverse-document-frequency weights)

from sklearn.feature_extraction.text import TfidfVectorizer
max_features = 5000 # only consider the most frequent words (there are 14955 different words in the dataset)

vectorizer = TfidfVectorizer(stop_words='english', # see https://scikit-learn.org/stable/modules/feature_extraction.html#stop-words
                             max_features=max_features)
# Fit the vocabulary on the training phrases only; the same fitted vectorizer
# must be reused (transform, not fit_transform) on any other data.
X = vectorizer.fit_transform(train_data['Phrase'])

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2.
# Prefer its replacement and fall back only on old installations. The result is
# converted to a plain list so `vocabulary` keeps the type it had before.
if hasattr(vectorizer, 'get_feature_names_out'):
    vocabulary = vectorizer.get_feature_names_out().tolist()
else:
    vocabulary = vectorizer.get_feature_names()

# Target labels, taken from the 'Sentiment' column of the training data.
y = train_data['Sentiment']

In [None]:
# Training / Validation split

# Hold out 1% of the rows for validation. The shuffle seed is fixed so that the
# split -- and therefore every validation score computed on it -- is identical
# on each Restart & Run All; without it the scores of different runs are not
# comparable.
train_X, val_X, train_y, val_y = train_test_split(
    X, y, train_size=0.99, test_size=0.01, random_state=42)

print('Training / Validation split:')
print('train_X: ', train_X.shape, 'val_X:', val_X.shape)

In [None]:
# Decision Tree
# NOTE: this model is only scored on the validation set; the submission cell
# predicts with the extra-trees model (`et`), not with this tree.
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

# Seed the estimator: scikit-learn's tree builder permutes features when
# searching for splits, so an unseeded tree can differ from run to run.
decision_tree = DecisionTreeClassifier(random_state=42)
decision_tree = decision_tree.fit(train_X, train_y)

# Score on the held-out validation rows.
predictions = decision_tree.predict(val_X)
print('Decision tree accuracy: ', accuracy_score(val_y, predictions))

In [None]:
# Random Forest
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Seed the forest: bootstrap sampling and per-split feature sub-sampling are
# random, so without a fixed random_state the reported accuracy changes on
# every run.
random_forest = RandomForestClassifier(random_state=42)
random_forest = random_forest.fit(train_X, train_y)

# Score on the held-out validation rows.
predictions = random_forest.predict(val_X)
print('Random forest accuracy: ', accuracy_score(val_y, predictions))

In [None]:
# Extra trees
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.metrics import accuracy_score

# Seed the ensemble: extra-trees draws its split thresholds at random, so an
# unseeded model yields a different validation score -- and different
# downstream predictions -- on each run.
et = ExtraTreesClassifier(random_state=42)
et = et.fit(train_X, train_y)

# Score on the held-out validation rows.
predictions = et.predict(val_X)
print('Extra trees accuracy: ', accuracy_score(val_y, predictions))

In [None]:
# Encode the test phrases with the vocabulary fitted on the training data
# (transform only -- refitting here would change the feature columns).
test_X = vectorizer.transform(test_data['Phrase'])
test_predictions = et.predict(test_X)

# Build the frame directly from the id column and the prediction array. The
# predictions are attached positionally, so this cannot introduce NaN rows the
# way an index-aligned concat would if test_data had a non-default index.
submission = pd.DataFrame({'PhraseId': test_data['PhraseId'],
                           'Sentiment': test_predictions})
submission.to_csv('submission.csv', index=False)

# Preview last so that Jupyter actually renders it (only the final expression
# of a cell is displayed).
submission.head()