In [1]:
import os
import numpy as np
import pandas as pd
import seaborn as sns
sns.set_style('darkgrid')
import warnings 
warnings.filterwarnings("ignore")

from sklearn.metrics import log_loss
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB

In [2]:
train = pd.read_csv('/kaggle/input/moviescriptsdata/MovieScriptsParticipantsData/Train.csv')
test = pd.read_csv('/kaggle/input/moviescriptsdata/MovieScriptsParticipantsData/Test.csv')
sample_sub = pd.read_excel('/kaggle/input/moviescriptsdata/MovieScriptsParticipantsData/Movie_Scripts_Sample_Submission.xlsx')

script_folder = '/kaggle/input/moviescriptsdata/MovieScriptsParticipantsData/Scripts/'

In [3]:
train['Script'] = [open(script_folder + os.sep + file, "r").read() for file in train['File_Name']]
test['Script'] = [open(script_folder + os.sep + file, "r").read() for file in test['File_Name']]

In [4]:
train = train.append(train[train['Labels'] == 18])
train.reset_index(drop=True, inplace=True)

In [5]:
train.head()

Unnamed: 0,File_Name,Labels,Script
0,file_2180.txt,8,"\t\t\tCrouching Tiger, Hidden Dragon\n\n\t\t\t..."
1,file_693.txt,4,"""MUMFO..."
2,file_2469.txt,6,MAX PAYNE\n\n ...
3,file_2542.txt,6,SLUMDOG MILLIONAIRE\n\n ...
4,file_378.txt,16,<b><!--\n\n</b>if (window!= top)\n\ntop.locati...


In [7]:
import re 

def clean_text(txt):
    txt = re.sub(r'\d', '', txt)
    txt = re.sub(r'\s+', ' ', txt)
    return txt

train['Script'] = train['Script'].apply(clean_text)
test['Script'] = test['Script'].apply(clean_text)

In [8]:
y_pred_total = []

for i in range(3): 
    
    X_train, X_valid, y_train, y_valid = train_test_split(train['Script'], train['Labels'], test_size=0.30, stratify=train['Labels'], random_state=i)
    
    tfv = TfidfVectorizer(min_df=3, max_features=2500, strip_accents='unicode', analyzer='word',token_pattern=r'\w{1,}', 
                          ngram_range=(1, 1), use_idf=1, smooth_idf=1, sublinear_tf=1, stop_words = 'english')

    tfv.fit(list(X_train) + list(X_valid))
    X_train_tfv =  tfv.transform(X_train) 
    X_valid_tfv = tfv.transform(X_valid)     
    
    # validation set
    nb = MultinomialNB(alpha=0.5)
    nb.fit(X_train_tfv, y_train)
    y_pred = nb.predict_proba(X_valid_tfv)  
    print('LogLoss:', log_loss(y_valid, y_pred))
    
    # for prediction on test set
    full_text = list(train['Script'].values) + list(test['Script'].values)
    tfv.fit(full_text)
    train_tfidf = tfv.transform(train['Script'])
    test_tfidf = tfv.transform(test['Script'])
    y_pred = nb.predict_proba(test_tfidf)
    
    y_pred_total.append(y_pred)

LogLoss: 2.3009984562136903
LogLoss: 2.3153217718936334
LogLoss: 2.285426386861364


In [9]:
np.mean(y_pred_total,0)

array([[0.06052975, 0.05733965, 0.05360063, ..., 0.06024438, 0.04739188,
        0.02599997],
       [0.04932956, 0.04963266, 0.06350208, ..., 0.06394144, 0.05790539,
        0.03098648],
       [0.06916327, 0.06586735, 0.0499741 , ..., 0.08043775, 0.04096369,
        0.01867417],
       ...,
       [0.04165418, 0.04951884, 0.07651315, ..., 0.04893231, 0.06296459,
        0.03541658],
       [0.05406455, 0.05406271, 0.06951553, ..., 0.05294551, 0.06423305,
        0.03172198],
       [0.05301252, 0.04988507, 0.08053355, ..., 0.04293092, 0.06364365,
        0.04000425]])

In [10]:
y_pred = np.mean(y_pred_total,0)

sample_sub = sample_sub.loc[:, ['File_Name']]
sample_sub = pd.concat([sample_sub, pd.DataFrame(y_pred)], axis=1, sort=False)

sample_sub.to_excel('Output.xlsx', index=False)

In [11]:
sample_sub.head(5)

Unnamed: 0,File_Name,0,1,2,3,4,5,6,7,8,...,12,13,14,15,16,17,18,19,20,21
0,file_2300.txt,0.06053,0.05734,0.053601,0.004705,0.075927,0.080323,0.058893,0.057995,0.078566,...,0.014026,0.042897,0.078805,0.071776,0.049087,0.004804,0.004547,0.060244,0.047392,0.026
1,file_809.txt,0.04933,0.049633,0.063502,0.005316,0.049329,0.074316,0.05402,0.060672,0.081275,...,0.016045,0.048516,0.079152,0.058026,0.053983,0.005194,0.00529,0.063941,0.057905,0.030986
2,file_1383.txt,0.069163,0.065867,0.049974,0.003195,0.068037,0.069784,0.062389,0.050649,0.078285,...,0.009914,0.035558,0.084684,0.062238,0.067102,0.003214,0.003265,0.080438,0.040964,0.018674
3,file_983.txt,0.072292,0.067531,0.033305,0.001725,0.098041,0.077049,0.105504,0.035494,0.066595,...,0.005807,0.02401,0.059197,0.069921,0.057128,0.001814,0.001715,0.105224,0.024689,0.011101
4,file_1713.txt,0.053238,0.048428,0.069871,0.00583,0.04735,0.068709,0.050298,0.065208,0.081701,...,0.017798,0.053521,0.086088,0.047948,0.051308,0.005973,0.005982,0.058606,0.064228,0.032058


In [12]:
from IPython.display import HTML
import pandas as pd
import numpy as np
import base64

def create_download_link(df, title = "Download CSV file", filename = "submission.csv"):  
    csv = df.to_csv(index=False)
    b64 = base64.b64encode(csv.encode())
    payload = b64.decode()
    html = '<a download="{filename}" href="data:text/csv;base64,{payload}" target="_blank">{title}</a>'
    html = html.format(payload=payload,title=title,filename=filename)
    return HTML(html)

create_download_link(sample_sub)

In [None]:
#Public LB 3.00619