# Word counts with Pipeline
This notebook is an exploration of using `sklearn` pipelines. The features and model used are identical to those in `2_wordcounts.ipynb`.  

Submission accuracy: `0.60200`

In [1]:
%%script false
import nltk
nltk.download('stopwords')
nltk.download('wordnet')

In [2]:
import os
import re
from os.path import join

import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer
from tqdm import tqdm

# Register tqdm's `progress_apply` on pandas objects; the cleaning step below
# calls it to show a progress bar while processing the reviews.
tqdm.pandas()

In [3]:
# Directory that holds the input TSV files.
src = 'data'

# The training file is tab-separated, so the separator is given explicitly.
df_train = pd.read_csv(
    join(src, 'labeledTrainData.tsv'),
    sep='\t',
)
df_train.head()

Unnamed: 0,id,sentiment,review
0,5814_8,1,With all this stuff going down at the moment w...
1,2381_9,1,"\The Classic War of the Worlds\"" by Timothy Hi..."
2,7759_3,0,The film starts with a manager (Nicholas Bell)...
3,3630_4,0,It must be assumed that those who praised this...
4,9495_8,1,Superbly trashy and wondrously unpretentious 8...


## Data cleaning

In [4]:
lemmatizer = WordNetLemmatizer()
# Stored under its own name on purpose: rebinding `stopwords` would shadow the
# imported NLTK corpus module and make this cell fail when it is run a second time.
english_stopwords = set(stopwords.words('english'))


def clean_sentence(sentence):
    """Normalise one raw review into a space-separated string of lemmas.

    Steps, in order: strip HTML markup, replace every non-letter character
    with a space, lower-case and split on whitespace, drop English stop
    words, lemmatize each remaining token.

    Parameters
    ----------
    sentence : str
        Raw review text, possibly containing HTML.

    Returns
    -------
    str
        The cleaned tokens joined by single spaces.
    """
    removed_markup = BeautifulSoup(sentence, 'html.parser').text
    removed_punctuation = re.sub(r'[^a-zA-Z]', ' ', removed_markup)
    tokens = removed_punctuation.lower().split()
    removed_stopwords = [w for w in tokens if w not in english_stopwords]
    lemmatized = [lemmatizer.lemmatize(w) for w in removed_stopwords]
    return ' '.join(lemmatized)


def clean_reviews(reviews):
    """Apply `clean_sentence` to every element of `reviews` with a progress bar.

    `reviews` must provide `progress_apply` (a pandas Series after
    `tqdm.pandas()` has been called).
    """
    return reviews.progress_apply(clean_sentence)


# A named function (rather than a lambda) keeps the transformer, and any
# pipeline containing it, picklable. `validate=False` passes the input through
# without converting it to an array first.
clean_all = FunctionTransformer(clean_reviews, validate=False)

## Feature extraction

In [5]:
# Word-level count features, capped at 5000 vocabulary entries.
vectorizer = CountVectorizer(
    max_features=5000,
    analyzer='word',
)

## Classifier

In [6]:
# Forest of 100 trees; the fixed seed keeps results repeatable between runs.
clf = RandomForestClassifier(
    random_state=0,
    n_estimators=100,
)

## Pipeline & training

In [7]:
%%time
pipeline = Pipeline([
    ('clean', clean_all),
    ('vec', vectorizer),
    ('clf', clf),
])
pipeline.fit(df_train['review'], df_train['sentiment'])

100%|██████████| 25000/25000 [00:27<00:00, 903.16it/s] 


CPU times: user 1min 56s, sys: 480 ms, total: 1min 57s
Wall time: 1min 57s


## Testing

In [8]:
# The test file is read from the same directory and is also tab-separated.
df_test = pd.read_csv(
    join(src, 'testData.tsv'),
    sep='\t',
)
df_test.head()

Unnamed: 0,id,review
0,12311_10,Naturally in a film who's main themes are of m...
1,8348_2,This movie is a disaster within a disaster fil...
2,5828_4,"All in all, this is a movie for kids. We saw i..."
3,7186_2,Afraid of the Dark left me with the impression...
4,12128_7,A very accurate depiction of small time mob li...


In [9]:
%%time
# Feed the raw test reviews through the fitted pipeline (clean -> vec -> clf)
# to obtain one predicted sentiment label per review.
pred = pipeline.predict(df_test['review'])

100%|██████████| 25000/25000 [00:30<00:00, 821.40it/s]


CPU times: user 34.8 s, sys: 265 ms, total: 35 s
Wall time: 35.1 s


In [10]:
# Assemble the submission table: the review id next to its predicted sentiment.
output = pd.DataFrame(
    data={
        'id': df_test['id'],
        'sentiment': pred,
    }
)
output.head()

Unnamed: 0,id,sentiment
0,12311_10,1
1,8348_2,0
2,5828_4,1
3,7186_2,1
4,12128_7,1


In [11]:
# Sanity check on the submission table: displays (number of rows, number of columns).
output.shape

(25000, 2)

In [12]:
# Write the submission file without the DataFrame index column.
# `to_csv` does not create missing parent directories, so make sure the target
# folder exists first; `exist_ok=True` makes this a no-op on later runs.
submission_path = 'submission/wordcount_randomforest_pipeline.csv'
os.makedirs(os.path.dirname(submission_path), exist_ok=True)
output.to_csv(submission_path, index=False)