**Import**

In [2]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn import datasets, linear_model, metrics, model_selection, pipeline, preprocessing
import ast
import joblib

**Data loading and preprocessing**

In [4]:
train = pd.read_csv('/Users/karina/Desktop/ml/HWs/hw2/train.csv')
test = pd.read_csv('/Users/karina/Desktop/ml/HWs/hw2/test.csv')

In [5]:
train.head()

Unnamed: 0,id,movie,dialogue,genres
0,0,0,I thought you were in a meeting--? <BR> I am. ...,"[u'drama', u'romance']"
1,1,1,Are you sure you're okay? You're pale. <BR> I...,[u'drama']
2,2,2,Go on! Get out! <BR> Mom look don't say anythi...,[u'comedy']
3,3,3,I could have lost my fucking hands. <BR> That ...,"[u'mystery', u'thriller']"
4,4,4,Stick with me on this Gloria. I need you... <...,"[u'crime', u'thriller']"


In [6]:
train1 = train['genres'].str.extract(r'\[(.*)\]')
train1[0] = train1[0].str.replace("u'", '')
train1[0] = train1[0].str.replace("'", '')

In [7]:
train['genres'] = train1

In [8]:
train.head()

Unnamed: 0,id,movie,dialogue,genres
0,0,0,I thought you were in a meeting--? <BR> I am. ...,"drama, romance"
1,1,1,Are you sure you're okay? You're pale. <BR> I...,drama
2,2,2,Go on! Get out! <BR> Mom look don't say anythi...,comedy
3,3,3,I could have lost my fucking hands. <BR> That ...,"mystery, thriller"
4,4,4,Stick with me on this Gloria. I need you... <...,"crime, thriller"


In [9]:
test.head()

Unnamed: 0,id,dialogue
0,0,Boy! Did you see the way Mama whopped that dep...
1,1,"Gordon, the insurance people are balking on th..."
2,2,Very fancy. Did you design the bottle? <BR> W...
3,3,It makes me so mad. Steven Schwimmer ready to ...
4,4,Something ought to loosen him up ... how comes...


**CountVectorizer**

In [10]:
vectorizer = CountVectorizer(stop_words={'english'})

vectorized_train_data = vectorizer.fit_transform(train['dialogue'])
vectorized_test_data = vectorizer.transform(test['dialogue'])

In [11]:
len(vectorizer.vocabulary_)

35706

**Modeling**

In [12]:
model = linear_model.LogisticRegression()
model.fit(vectorized_train_data, train['genres'])



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [13]:
train_preds = model.predict(vectorized_train_data)
test_preds = model.predict(vectorized_test_data)

In [14]:
train_target = train['genres'].to_numpy()

In [15]:
print(f'train accuracy: {metrics.accuracy_score(train_target, train_preds):.3f}')

train accuracy: 0.864


In [16]:
train_preds

array(['drama', 'drama', 'comedy', ..., 'drama', 'drama, romance',
       'crime, drama'], dtype=object)

In [17]:
test_preds

array(['drama', 'drama, thriller', 'drama', ..., 'comedy, drama', 'drama',
       'comedy, romance'], dtype=object)

In [18]:
solution = pd.DataFrame(test_preds)
solution.columns=['genres'] 

In [19]:
solution.head()

Unnamed: 0,genres
0,drama
1,"drama, thriller"
2,drama
3,drama
4,"action, thriller"


In [20]:
test_obj = "Are you sure you're okay? You're pale."
vectorized_test_obj = vectorizer.transform([test_obj])

In [21]:
 model.predict(vectorized_test_obj)[0]

'drama, romance'

**Dump vectorizer & model**

In [22]:
with open('genres_vectorizer_dump.pkl', 'wb') as output_file:
    joblib.dump(vectorizer, output_file)

with open('genres_model_dump.pkl', 'wb') as output_file:
    joblib.dump(model, output_file)
    
with open('genres_target_dump.pkl', 'wb') as output_file:
    joblib.dump(train_target, output_file)