In [12]:
from sklearn.model_selection import train_test_split
import pandas as pd

In [13]:
overallDataset = pd.read_csv("data/MovieGenre.csv", sep=",", encoding = "ISO-8859-1")
overallDataset = overallDataset[["imdbId", "Genre"]]
# We need to drop NaNs
overallDataset = overallDataset.dropna()

In [14]:
overallDataset

Unnamed: 0,imdbId,Genre
0,114709,Animation|Adventure|Comedy
1,113497,Action|Adventure|Family
2,113228,Comedy|Romance
3,114885,Comedy|Drama|Romance
4,113041,Comedy|Family|Romance
...,...,...
40103,83168,Drama
40104,82875,Comedy
40105,815258,Horror
40106,79142,Action|Comedy


In [15]:
# We need to filter overallDataset
# For when imdbId is not in the list of 0 byte images
f = open("0byte_images.txt", "r")
zero_byte_images = f.read()
zero_byte_images = zero_byte_images.split("\n")
zero_byte_images = list(filter(None, zero_byte_images))
for i in range(len(zero_byte_images)):
    process_record = zero_byte_images[i].replace("posters/", "").replace(".jpg", "")
    if process_record == "":
        print(zero_byte_images[i])
    zero_byte_images[i] = int(process_record)
    
# Read in bad_jpegs
f = open("bad_jpegs.txt", "r")
bad_jpegs = f.read()
bad_jpegs = bad_jpegs.split("\n")
bad_jpegs = list(filter(None, bad_jpegs))
for i in range(len(bad_jpegs)):
    bad_jpegs[i] = int(bad_jpegs[i])
    
remove_images = zero_byte_images + bad_jpegs

before_length = len(overallDataset)
print("Before filtering out zero byte images, we had " + str(before_length) + " records")
overallDataset = overallDataset[~overallDataset['imdbId'].isin(remove_images)]
print("Now we have " + str(len(overallDataset)) + " records")
print("We have removed " + str(before_length - len(overallDataset)) + " records")

Before filtering out zero byte images, we had 39963 records
Now we have 39137 records
We have removed 826 records


In [16]:
def genre_discover(genre):
    category = None
    if "|" in genre:
        genre_split = genre.split("|")
        if (genre_split[0] == "Drama") or (genre_split[0] == "Comedy") or (genre_split[0] == "Action"):
            if (genre_split[1] == "Drama"):
                if len(genre_split) == 3:
                    category = genre_split[2]
                else:
                    category = genre_split[1]
            else:
                category = genre_split[1]
        else:
            category = genre_split[0]
    else:
        category = genre
    return category

In [17]:
overallDataset['Genre_1'] = overallDataset.apply(lambda x: genre_discover(x['Genre']), axis=1)
print("Number of Records originally: " + str(len(overallDataset["Genre_1"])))
print("Number of unique values in Genre_1 originally: " + str(len(overallDataset["Genre_1"].unique())))
#overallDataset["Genre_1"].value_counts()

Number of Records originally: 39137
Number of unique values in Genre_1 originally: 26


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  overallDataset['Genre_1'] = overallDataset.apply(lambda x: genre_discover(x['Genre']), axis=1)


In [18]:
genre_counts = overallDataset["Genre_1"].value_counts()
filt_genre_counts = genre_counts[genre_counts >= 1100].index
fixedOverallDataset = overallDataset[overallDataset['Genre_1'].isin(filt_genre_counts)]

new_balanced_df = pd.DataFrame(columns=["imdbId", "Genre", "Genre_1"])
for genre in filt_genre_counts:
    cut_temp_df = fixedOverallDataset[fixedOverallDataset["Genre_1"] == genre].iloc[:1200]
    new_balanced_df = pd.concat([new_balanced_df, cut_temp_df])
    
genre_category_numbers = list(filt_genre_counts)
def label_id_add(genre):
    return int(genre_category_numbers.index(genre))

    
new_balanced_df = new_balanced_df.drop(['Genre'], axis=1)
new_balanced_df = new_balanced_df.rename(columns={"Genre_1": "Genre"})
new_balanced_df['Label'] = new_balanced_df.apply(lambda x: label_id_add(x['Genre']), axis=1)
new_balanced_df = new_balanced_df.drop(['Genre'], axis=1)

In [19]:
new_balanced_df["Label"].value_counts()

0    1200
1    1200
2    1200
3    1200
4    1200
5    1200
6    1200
7    1200
8    1200
9    1200
Name: Label, dtype: int64

In [20]:
original_size = len(new_balanced_df)
train, test = train_test_split(new_balanced_df, test_size=0.1)
train_size = len(train)
test_size = len(test)

print("The new number of records was " + str(original_size) + ".")
print("The train size was " + str(train_size) + " and the test size was " + str(test_size))
print("Number of unique values Labels: " + str(len(new_balanced_df["Label"].unique())))
print("The columns we save out are: " + str(list(new_balanced_df.columns)))
assert original_size == (train_size + test_size)

The new number of records was 12000.
The train size was 10800 and the test size was 1200
Number of unique values Labels: 10
The columns we save out are: ['imdbId', 'Label']


In [21]:
test.to_csv("test_labels.csv", index=False, header=False)
train.to_csv("train_labels.csv", index=False, header=False)