Score: `0.59375`

In [1]:
import os
from os.path import join

import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from tqdm import tqdm

In [2]:
from utils import clean_sentence, extract_features

## Training
Extract features using the `CountVectorizer`, then fit a `RandomForestClassifier` on them.

In this case, the model learns the relationship between how often each vocabulary word occurs in a review and whether that review expresses a positive or negative sentiment.

In [3]:
# Folder holding the input data files; also used later to locate the test set.
src = 'data'

# The file has a .csv extension but is parsed as tab-separated.
train_path = join(src, 'cleanedTrainData.csv')
df_train = pd.read_csv(train_path, sep='\t')
df_train

Unnamed: 0,id,sentiment,review,review_cleaned
0,5814_8,1,With all this stuff going down at the moment w...,stuff going moment mj started listening music ...
1,2381_9,1,"\The Classic War of the Worlds\"" by Timothy Hi...",classic war world timothy hines entertaining f...
2,7759_3,0,The film starts with a manager (Nicholas Bell)...,film start manager nicholas bell giving welcom...
3,3630_4,0,It must be assumed that those who praised this...,must assumed praised film greatest filmed oper...
4,9495_8,1,Superbly trashy and wondrously unpretentious 8...,superbly trashy wondrously unpretentious explo...
5,8196_8,1,I dont know why people think this is such a ba...,dont know people think bad movie got pretty go...
6,7166_2,0,"This movie could have been very good, but come...",movie could good come way short cheesy special...
7,10633_1,0,I watched this video at a friend's house. I'm ...,watched video friend house glad waste money bu...
8,319_1,0,"A friend of mine bought this film for £1, and ...",friend mine bought film even grossly overprice...
9,8713_10,1,<br /><br />This movie is full of references. ...,movie full reference like mad max ii wild one ...


In [4]:
# Turn the pre-cleaned training reviews into a feature matrix via the helper from `utils`.
train_reviews = df_train['review_cleaned']
train_features = extract_features(train_reviews)

# Show the matrix dimensions as a quick sanity check.
train_features.shape

(25000, 5000)

In [5]:
%%time
clf = RandomForestClassifier(n_estimators=100)
clf.fit(train_features, df_train['sentiment'])

Wall time: 1min 20s


## Testing
Similarly, extract features for the test data and use the trained classifier to predict the sentiment of each review.

In [6]:
# The test set is a tab-separated file stored in the same data folder.
test_path = join(src, 'testData.tsv')
df_test = pd.read_csv(test_path, sep='\t')
df_test

Unnamed: 0,id,review
0,12311_10,Naturally in a film who's main themes are of m...
1,8348_2,This movie is a disaster within a disaster fil...
2,5828_4,"All in all, this is a movie for kids. We saw i..."
3,7186_2,Afraid of the Dark left me with the impression...
4,12128_7,A very accurate depiction of small time mob li...
5,2913_8,"...as valuable as King Tut's tomb! (OK, maybe ..."
6,4396_1,This has to be one of the biggest misfires eve...
7,395_2,"This is one of those movies I watched, and won..."
8,10616_1,The worst movie i've seen in years (and i've s...
9,9074_9,"Five medical students (Kevin Bacon, David Labr..."


In [7]:
# Clean every raw test review with the project helper and store the result in a new
# `review_cleaned` column; tqdm reports progress because this pass over the reviews is slow.
df_test['review_cleaned'] = list(map(clean_sentence, tqdm(df_test['review'])))
df_test

100%|████████████████████████████████████████████████████████████████████████████| 25000/25000 [20:59<00:00, 19.85it/s]


Unnamed: 0,id,review,cleaned_reviews
0,12311_10,Naturally in a film who's main themes are of m...,naturally film main theme mortality nostalgia ...
1,8348_2,This movie is a disaster within a disaster fil...,movie disaster within disaster film full great...
2,5828_4,"All in all, this is a movie for kids. We saw i...",movie kid saw tonight child loved one point ki...
3,7186_2,Afraid of the Dark left me with the impression...,afraid dark left impression several different ...
4,12128_7,A very accurate depiction of small time mob li...,accurate depiction small time mob life filmed ...
5,2913_8,"...as valuable as King Tut's tomb! (OK, maybe ...",valuable king tut tomb ok maybe valuable worth...
6,4396_1,This has to be one of the biggest misfires eve...,one biggest misfire ever script nice could end...
7,395_2,"This is one of those movies I watched, and won...",one movie watched wondered watch find interest...
8,10616_1,The worst movie i've seen in years (and i've s...,worst movie seen year seen lot movie acting te...
9,9074_9,"Five medical students (Kevin Bacon, David Labr...",five medical student kevin bacon david labracc...


In [9]:
# Build the test feature matrix with the same project helper used for the training reviews.
# NOTE(review): `extract_features` is invoked separately here and on the training data. If it
# fits a new vectorizer on each call, these columns would not line up with the vocabulary
# the classifier was trained on, which could explain the low score -- confirm in `utils`.
test_features = extract_features(df_test['review_cleaned'])
# Predict a sentiment label for every test review with the fitted classifier.
pred = clf.predict(test_features)

In [12]:
# Submission frame: the review ids paired with the predicted sentiment labels.
output = df_test[['id']].assign(sentiment=pred)
output

Unnamed: 0,id,sentiment
0,12311_10,1
1,8348_2,0
2,5828_4,0
3,7186_2,0
4,12128_7,1
5,2913_8,0
6,4396_1,1
7,395_2,1
8,10616_1,0
9,9074_9,0


In [13]:
# `to_csv` does not create missing parent folders, so make sure the target
# directory exists before writing; otherwise a fresh checkout fails on this cell.
os.makedirs('submission', exist_ok=True)
# Write the submission without the DataFrame index column.
output.to_csv('submission/bow_randomforest.csv', index=False)