In [1]:
import pandas as pd
import os
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.multiclass import OneVsRestClassifier
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.metrics import r2_score, roc_auc_score
from sklearn.model_selection import train_test_split

In [2]:
path = '/Users/sabygalvan/Downloads/AppliedDeepLearningClass-master/dataTraining.csv'

In [4]:
dataTraining = pd.read_csv(os.path.join(path, 'dataTraining.csv'), encoding='UTF-8', index_col=0)
dataTesting = pd.read_csv(os.path.join(path, 'dataTesting.csv'), encoding='UTF-8', index_col=0)

In [6]:
dataTesting.head()

Unnamed: 0,year,title,plot
1,1999,Message in a Bottle,"who meets by fate , shall be sealed by fate ...."
4,1978,Midnight Express,"the true story of billy hayes , an american c..."
5,1996,Primal Fear,martin vail left the chicago da ' s office to ...
6,1950,Crisis,husband and wife americans dr . eugene and mr...
7,1959,The Tingler,the coroner and scientist dr . warren chapin ...


In [7]:
dataTraining.head()

Unnamed: 0,year,title,plot,genres,rating
3107,2003,Most,most is the story of a single father who takes...,"['Short', 'Drama']",8.0
900,2008,How to Be a Serial Killer,a serial killer decides to teach the secrets o...,"['Comedy', 'Crime', 'Horror']",5.6
6724,1941,A Woman's Face,"in sweden , a female blackmailer with a disfi...","['Drama', 'Film-Noir', 'Thriller']",7.2
4704,1954,Executive Suite,"in a friday afternoon in new york , the presi...",['Drama'],7.4
2582,1990,Narrow Margin,"in los angeles , the editor of a publishing h...","['Action', 'Crime', 'Thriller']",6.6


In [30]:
vect = CountVectorizer(ngram_range=(1, 2), max_features=1000)
X_dtm = vect.fit_transform(dataTraining['plot'])
X_dtm.shape

(7895, 1000)

In [31]:
# Definicion de funcion para preprocesar el texto
def preProssText(df, commonWord, numberCommonWords, rareWord, numberRareWords, spellCorrect):
  # Remove punctutation
  df['plot'] = df['plot'].str.replace('[^\w\s]','')
  
  # Lower case
  df['plot'] = df['plot'].apply(lambda x: " ".join(x.lower() for x in x.split()))
  
  # Remove Common words
  if commonWord == True:
    freq = pd.Series(' '.join(df['plot']).split()).value_counts()[:numberCommonWords]
    df['plot'] = df['plot'].apply(lambda x: " ".join(x for x in x.split() if x not in freq))
  
  # Remove Rare words
  if rareWord == True:
    freq = pd.Series(' '.join(df['plot']).split()).value_counts()[-numberRareWords:]
    df['plot'] = df['plot'].apply(lambda x: " ".join(x for x in x.split() if x not in freq))
  
  # Spelling correction -> Tener cuidado puede cambiar los valores. Ej: "ur" por "or" en vez de "your"
  if spellCorrect == True:
    from textblob import TextBlob
    df['plot'] = df['plot'].apply(lambda x: str(TextBlob(x).correct()))
    
  return df

In [32]:
# Pre-procesamiento de texto
dataTraining = preProssText(dataTraining, commonWord=True, numberCommonWords=10, rareWord=True, numberRareWords=10, spellCorrect=False)
dataTesting = preProssText(dataTesting, commonWord=True, numberCommonWords=10, rareWord=True, numberRareWords=10, spellCorrect=False)

In [33]:
dataTraining.head()

Unnamed: 0,year,title,plot,genres,rating
3107,2003,Most,most story single father takes eight year old ...,"[Short, Drama]",8.0
900,2008,How to Be a Serial Killer,serial killer decides teach secrets satisfying...,"[Comedy, Crime, Horror]",5.6
6724,1941,A Woman's Face,sweden female blackmailer disfiguring facial s...,"[Drama, Film-Noir, Thriller]",7.2
4704,1954,Executive Suite,friday afternoon new york president tredway co...,[Drama],7.4
2582,1990,Narrow Margin,los angeles editor publishing house carol hunn...,"[Action, Crime, Thriller]",6.6


In [34]:
dataTesting.head()

Unnamed: 0,year,title,plot
1,1999,Message in a Bottle,meets fate shall be sealed fate theresa osborn...
4,1978,Midnight Express,true story billy hayes an american college stu...
5,1996,Primal Fear,martin vail left chicago da office become succ...
6,1950,Crisis,husband wife americans dr eugene mrs helen fer...
7,1959,The Tingler,coroner scientist dr warren chapin researching...


In [35]:
# define a function that accepts text and returns a list of lemmas
import nltk
nltk.download('wordnet')

# define a function that accepts text and returns a list of stem
def split_into_stem(text):
    stemmer = SnowballStemmer('english')
    text = text.lower()
    words = text.split()
    return [stemmer.stem(word) for word in words]

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\VEC1602\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [36]:
def split_into_lemmas(text):
    from nltk.stem import WordNetLemmatizer
    wordnet_lemmatizer = WordNetLemmatizer()
    
    text = text.lower()
    words = text.split()
    return [wordnet_lemmatizer.lemmatize(word) for word in words]

In [37]:
# Text processing
from sklearn.feature_extraction.text import CountVectorizer
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing import sequence
from nltk.stem.snowball import SnowballStemmer
from sklearn.model_selection import train_test_split

In [38]:
vect = CountVectorizer(analyzer=split_into_lemmas, ngram_range=(1, 2), max_features=1000)
X_dtm = vect.fit_transform(dataTraining['plot'])
X_dtm.shape

(7895, 1000)

In [42]:
dataTraining.head()

Unnamed: 0,year,title,plot,genres,rating
3107,2003,Most,most story single father takes eight year old ...,"[Short, Drama]",8.0
900,2008,How to Be a Serial Killer,serial killer decides teach secrets satisfying...,"[Comedy, Crime, Horror]",5.6
6724,1941,A Woman's Face,sweden female blackmailer disfiguring facial s...,"[Drama, Film-Noir, Thriller]",7.2
4704,1954,Executive Suite,friday afternoon new york president tredway co...,[Drama],7.4
2582,1990,Narrow Margin,los angeles editor publishing house carol hunn...,"[Action, Crime, Thriller]",6.6


In [43]:
dataTraining['genres'] = dataTraining['genres'].map(lambda x: eval(x))

le = MultiLabelBinarizer()
y_genres = le.fit_transform(dataTraining['genres'])

TypeError: eval() arg 1 must be a string, bytes or code object

In [44]:
y_genres.shape

(7895, 24)

In [45]:
X_train, X_test, y_train_genres, y_test_genres = train_test_split(X_dtm, y_genres, test_size=0.1, random_state=42)

In [47]:
# Keras NN
from keras.models import Sequential
from keras.utils import np_utils
from keras.layers import Dense, Dropout, Activation, BatchNormalization, Input, Embedding, LSTM
from keras.optimizers import RMSprop
from keras.callbacks import History
from keras.models import Model

In [55]:
# Modelo alternativo
top_words = 1000
max_review_length = 1000

# Set the model
def textModel(drop=1, optimizer="RMSprop"):
    model = Sequential()
    model.add(Embedding(top_words, 32, input_length=max_review_length))
    model.add(LSTM(256))
    model.add(Dropout(drop))
    model.add(Dense(24, activation='sigmoid'))
    model.compile(loss='categorical_crossentropy', optimizer=optimizer, metrics=['accuracy'])
    
    return model



In [52]:
ix_train=np.random.choice(X_train.shape[0], 1000)
ix_test=np.random.choice(X_test.shape[0], 100)
model.fit(X_train[ix_train], y_train_genres[ix_train], epochs=3, batch_size=128, validation_data=(X_test[ix_test], y_test_genres[ix_test]))
#model.fit(X_dtm, y_genres, epochs=3, batch_size=10)
# epochs=5, batch_size=5 -> acc: 0.18

Train on 1000 samples, validate on 100 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x2378b75f4e0>

In [58]:
from sklearn.model_selection import RandomizedSearchCV
from keras.wrappers.scikit_learn import KerasClassifier

In [60]:
# Definicion del modelo KerasClassifier
model = KerasClassifier(build_fn=textModel, verbose=0)

# Lista de parametros a evaluar
parametros = {
    #"activation": ["softmax", "elu", "selu", "softplus", "softsign", "relu" ,"tanh", "sigmoid", "hard_sigmoid", "linear"],
    "optimizer": ["SGD", "RMSprop", "Adagrad", "Adadelta", "Adam", "Adamax", "Nadam"], 
    "drop": [0.2,0.5,1],
    #"kernel": ["random_normal", "he_uniform", "lecun_normal", "he_normal", "glorot_uniform", "glorot_normal", "lecun_uniform", "zeros", "ones"], 
    #"neurons": [5,10,50,100]
    #"moreHiddenLayers": [True, False]
}

# Busqueda aleatoria entre los parametros
random_search = RandomizedSearchCV(estimator=model, param_distributions=parametros)

In [None]:
random_result=random_search.fit(X_train, y_train_genres)

In [None]:
random_result

In [None]:
print("Mejor: %f using %s" % (random_result.best_score_, random_result.best_params_))