In [1]:
import pandas as pd
import re
from sklearn.model_selection import StratifiedKFold

In [2]:
# Read the pipe-separated raw movie table, then preview its first rows.
RAW_CSV_PATH = "../raw_data.csv"
raw_data = pd.read_csv(RAW_CSV_PATH, delimiter="|")
raw_data.head(5)

Unnamed: 0,movie_name,movie_genre,movie_description
0,Werewolf by Night,"Fantasy, Action, Adventure, Horror","On a dark and somber night, a secret cabal of ..."
1,Halloween Ends,"Holiday, Horror, Mystery & thriller",This is Laurie Strode's last stand. After 45 y...
2,Luckiest Girl Alive,"Crime, Drama","Luckiest Girl Alive centers on Ani FaNelli, a ..."
3,Hellraiser,"Horror, Mystery & thriller",A new take on Clive Barker's 1987 horror class...
4,Mr. Harrigan's Phone,Horror,"When Craig, a young boy living in a small town..."


In [3]:
# (rows, columns) of the freshly loaded dataset.
raw_data.shape

(8986, 3)

In [4]:
# The class label is the first run of word characters in the genre string,
# e.g. "Mystery & thriller" -> "Mystery", "Kids & family" -> "Kids".
# The vectorised .str accessor returns NaN for a row whose genre is missing or
# has no word characters, where a per-row re.findall(...)[0] would raise
# TypeError / IndexError and abort the whole cell.
raw_data["target"] = raw_data["movie_genre"].str.extract(r"(\w+)", expand=False)
raw_data.head()

Unnamed: 0,movie_name,movie_genre,movie_description,target
0,Werewolf by Night,"Fantasy, Action, Adventure, Horror","On a dark and somber night, a secret cabal of ...",Fantasy
1,Halloween Ends,"Holiday, Horror, Mystery & thriller",This is Laurie Strode's last stand. After 45 y...,Holiday
2,Luckiest Girl Alive,"Crime, Drama","Luckiest Girl Alive centers on Ani FaNelli, a ...",Crime
3,Hellraiser,"Horror, Mystery & thriller",A new take on Clive Barker's 1987 horror class...,Horror
4,Mr. Harrigan's Phone,Horror,"When Craig, a young boy living in a small town...",Horror


In [5]:
# Names of the six most frequent labels, most common first.
genre_counts = raw_data["target"].value_counts()
top_6_genre = genre_counts.nlargest(6).index.tolist()
top_6_genre

['Comedy', 'Drama', 'Horror', 'Mystery', 'Action', 'Kids']

In [6]:
# Keep only the rows labelled with one of the top genres and renumber the
# index from zero.
in_top_genres = raw_data["target"].isin(top_6_genre)
raw_data = raw_data.loc[in_top_genres].reset_index(drop=True)
raw_data.head()

Unnamed: 0,movie_name,movie_genre,movie_description,target
0,Hellraiser,"Horror, Mystery & thriller",A new take on Clive Barker's 1987 horror class...,Horror
1,Mr. Harrigan's Phone,Horror,"When Craig, a young boy living in a small town...",Horror
2,Hocus Pocus 2,"Kids & family, Holiday, Comedy, Fantasy",It's been 29 years since someone lit the Black...,Kids
3,X,"Horror, Mystery & thriller","In 1979, a group of young filmmakers set out t...",Horror
4,Piggy,"Horror, Mystery & thriller, Drama",With the summer sun beating down on her rural ...,Horror


In [7]:
# (rows, columns) remaining after the top-genre filter.
raw_data.shape

(6300, 4)

In [8]:
def create_folds(data_frame, targets, folds=5, seed=42, shuffle=True, fold_column="fold"):
    """Label every row of ``data_frame`` with a stratified CV fold number.

    Each row lands in the validation part of exactly one of the ``folds``
    splits produced by ``StratifiedKFold``; that split's 1-based number is
    written to ``fold_column``.

    Parameters
    ----------
    data_frame : pandas.DataFrame
        Frame to label. ``fold_column`` is added to (or overwritten on) this
        object in place.
    targets : array-like
        One class label per row, in the same order as ``data_frame``; used
        for stratification.
    folds : int, default 5
        Number of folds.
    seed : int, default 42
        Random state used for shuffling. Ignored when ``shuffle`` is False.
    shuffle : bool, default True
        Whether to shuffle rows within each class before splitting.
    fold_column : str, default "fold"
        Name of the integer column that receives the fold numbers.

    Returns
    -------
    pandas.DataFrame
        The same ``data_frame`` object, now carrying ``fold_column``.
    """
    # Recent scikit-learn versions raise ValueError if random_state is set
    # while shuffle is False, so only forward the seed when it is used.
    cv_strategy = StratifiedKFold(
        n_splits=folds,
        shuffle=shuffle,
        random_state=seed if shuffle else None,
    )

    # split() yields *positional* row indices. Collect the assignments by
    # position so the result is correct for any index on data_frame; a
    # label-based .loc write would mislabel or add rows unless the index
    # happens to be 0..n-1.
    fold_ids = pd.Series(0, index=range(len(data_frame)), dtype=int)
    splits = cv_strategy.split(X=data_frame, y=targets)
    for fold_number, (_, validation_positions) in enumerate(splits, start=1):
        fold_ids.iloc[validation_positions] = fold_number

    # Assign the raw values so no index alignment takes place.
    data_frame[fold_column] = fold_ids.to_numpy()

    return data_frame

In [9]:
# Tag every row with one of 8 folds, stratified on the genre label.
genre_labels = raw_data["target"].values
raw_data = create_folds(
    data_frame=raw_data,
    targets=genre_labels,
    folds=8,
    seed=42,
    shuffle=True,
    fold_column="fold",
)
raw_data.head()

Unnamed: 0,movie_name,movie_genre,movie_description,target,fold
0,Hellraiser,"Horror, Mystery & thriller",A new take on Clive Barker's 1987 horror class...,Horror,8
1,Mr. Harrigan's Phone,Horror,"When Craig, a young boy living in a small town...",Horror,1
2,Hocus Pocus 2,"Kids & family, Holiday, Comedy, Fantasy",It's been 29 years since someone lit the Black...,Kids,3
3,X,"Horror, Mystery & thriller","In 1979, a group of young filmmakers set out t...",Horror,4
4,Piggy,"Horror, Mystery & thriller, Drama",With the summer sun beating down on her rural ...,Horror,7


In [10]:
# Number of rows assigned to each fold.
raw_data["fold"].value_counts()

1    788
3    788
4    788
2    788
8    787
7    787
5    787
6    787
Name: fold, dtype: int64

In [11]:
# Training split: every fold except fold 1, without the genre and fold columns.
in_training_folds = raw_data["fold"].ne(1)
train_data = (
    raw_data.loc[in_training_folds]
    .drop(columns=["movie_genre", "fold"])
    .reset_index(drop=True)
)
train_data.head()

Unnamed: 0,movie_name,movie_description,target
0,Hellraiser,A new take on Clive Barker's 1987 horror class...,Horror
1,Hocus Pocus 2,It's been 29 years since someone lit the Black...,Kids
2,X,"In 1979, a group of young filmmakers set out t...",Horror
3,Piggy,With the summer sun beating down on her rural ...,Horror
4,Deadstream,After a public controversy left him disgraced ...,Horror


In [12]:
# Validation split: fold 1 only, without the genre and fold columns.
in_validation_fold = raw_data["fold"].eq(1)
val_data = (
    raw_data.loc[in_validation_fold]
    .drop(columns=["movie_genre", "fold"])
    .reset_index(drop=True)
)
val_data.head()

Unnamed: 0,movie_name,movie_description,target
0,Mr. Harrigan's Phone,"When Craig, a young boy living in a small town...",Horror
1,To Leslie,Leslie (Andrea Riseborough) is a West Texas si...,Drama
2,Hellraiser,Sexual deviant Frank (Sean Chapman) inadverten...,Horror
3,You Won't Be Alone,Set in an isolated mountain village in 19th ce...,Horror
4,"Confess, Fletch","In this delightful comedy romp, Jon Hamm stars...",Comedy


In [13]:
# Write both splits to CSV, leaving out the index column.
for split_frame, csv_path in ((train_data, "../train.csv"), (val_data, "../val.csv")):
    split_frame.to_csv(csv_path, index=False)