# Dataframe Clean: IMDB

## Intro

### Imports

In [None]:
import pandas as pd
import numpy as np
import matplotlib as plt

### Set Dataframe

In [None]:
# Load the raw dataset from IMDB.csv (path is relative to the working directory).
imdb = pd.read_csv('IMDB.csv')
# imdb.head()

# Clean a copy so the raw frame `imdb` is not modified by the steps below.
imdb2 = imdb.copy()
# imdb2.head()

### Dataframe info

In [None]:
# Overview of the frame: columns, dtypes and non-null counts per column.
imdb2.info()

In [None]:
# (rows, columns) before any cleaning.
imdb2.shape

### Delete duplicates

In [None]:
# Number of rows that have an exact duplicate; keep=False flags every copy,
# including the first occurrence.
imdb2.duplicated(keep=False).sum()

In [None]:
# Remove exact duplicate rows in place (the first occurrence of each is kept).
imdb2.drop_duplicates(inplace=True)
# Re-count to confirm that no duplicated rows remain.
imdb2.duplicated(keep=False).sum()

## Clean null values

First, let's find the null values in every column of the dataset.

In [None]:
# Count the missing values in each column.
imdb2.isna().sum()

### Color

In [None]:
# Frequency of each value in the color column (missing values are excluded).
imdb2["color"].value_counts()
# Variant that also counts the missing values:
# imdb2["color"].value_counts(dropna=False)

In [None]:
# Only one movie is in Black and White and the column has many missing values,
# so it carries almost no information: remove it from the frame.
imdb2.drop(columns=["color"], inplace=True)

In [None]:
# Check: the column count should be one lower after removing "color".
imdb2.shape

If we wanted to fill the null values of this variable instead of dropping the column, the missing entries are most likely "Color" as well, since that is by far the most common value.

### Director Name

In [None]:
# The 20 most frequent director names; dropna=False counts missing values too.
imdb2["director_name"].value_counts(dropna=False).head(20)

The information looks interesting for the model, so we can't eliminate it.

Let's see what the null values have in common

In [None]:
# Show the rows whose director name is missing.
imdb2.loc[imdb2["director_name"].isna()]

There is nothing clear in common, so let's set all null values to "Unknown".

In [None]:
# Replace missing director names with the placeholder "Unknown".
# The result is assigned back to the column instead of calling
# fillna(inplace=True) on the selected column: that chained in-place call
# raises a FutureWarning in pandas 2.x and, with Copy-on-Write enabled,
# only changes a temporary object and leaves imdb2 unmodified.
imdb2["director_name"] = imdb2["director_name"].fillna("Unknown")
# Check: NaN should no longer appear among the counts.
imdb2["director_name"].value_counts(dropna=False).head(20)

### Gross

In [None]:
# Histogram of the gross column.
imdb2.hist(column="gross")

The values are varied and seem interesting for the model, so we keep this column.

In [None]:
# Show the rows whose gross value is missing.
imdb2.loc[imdb2["gross"].isna()]

The null values don't have anything in common

In [None]:
# Show the movies whose gross exceeds 400 million.
imdb2.loc[imdb2["gross"].gt(400_000_000)]

The values that are outside the top values are not very interesting.

In [None]:
# Summary statistics for gross (count, mean, quartiles, ...).
imdb2["gross"].describe()

As the mean and the median are similar, let's fill all the nulls with the median value.

In [None]:
# Fill missing gross values with the column median.
# The result is assigned back to the column instead of calling
# fillna(inplace=True) on the selected column: that chained in-place call
# raises a FutureWarning in pandas 2.x and, with Copy-on-Write enabled,
# only changes a temporary object and leaves imdb2 unmodified.
imdb2["gross"] = imdb2["gross"].fillna(imdb2["gross"].median())

### Genres

In [None]:
# Summary of the genres column.
imdb2["genres"].describe()

In [None]:
# Frequency of each genres value; dropna=False counts missing values too.
imdb2["genres"].value_counts(dropna=False)

In [None]:
# imdb2["genres"].dropna(inplace=True) does not remove rows from imdb2: it acts
# on the selected column (a Series), not on the DataFrame itself.
# Drop the rows of the frame whose genres value is missing instead.
imdb2.dropna(subset=["genres"], inplace=True)

In [None]:
# Check: count the genres values again, still including any missing ones.
imdb2["genres"].value_counts(dropna=False)

### Budget

In [None]:
# Histogram of the budget column.
imdb2.hist(column="budget")

In [None]:
# Summary statistics for budget (count, mean, quartiles, ...).
imdb2["budget"].describe()

In [None]:
# Fill missing budget values with the column median.
# The result is assigned back to the column instead of calling
# fillna(inplace=True) on the selected column: that chained in-place call
# raises a FutureWarning in pandas 2.x and, with Copy-on-Write enabled,
# only changes a temporary object and leaves imdb2 unmodified.
imdb2["budget"] = imdb2["budget"].fillna(imdb2["budget"].median())

## Analyse Numeric Variables

In [None]:
# Continue on a new copy so imdb2 keeps the state reached after null cleaning.
imdb3 = imdb2.copy()

In [None]:
# Summary statistics of the numeric columns, used to spot implausible values.
imdb3.describe()

Explanation of what values are weird

### Duration

In [None]:
# Show the duration values above 300.
imdb3.loc[imdb3["duration"].gt(300), "duration"]

### Title Year

In [None]:
# Histogram of the title_year column.
imdb3.hist(column="title_year")

In [None]:
# Disabled draft: inspect title_year values below 1800 and replace them with the median.
# NOTE(review): if re-enabled, `.median` must be called (`.median()`); as written
# the method object itself would be used as the replacement value.
# imdb3.loc[imdb3["title_year"]<1800, "title_year"]
# imdb3["title_year"] = np.where(imdb3["title_year"]<1800, imdb3["title_year"].median, imdb3["title_year"])

### IMDB Score

In [None]:
# Histogram of the imdb_score column.
imdb3.hist(column="imdb_score")

In [None]:
# Collect the index labels of rows with a negative score, then drop those rows in place.
negative_score_labels = imdb3.index[imdb3["imdb_score"] < 0]
imdb3.drop(index=negative_score_labels, inplace=True)

## Analyse Categoric Variables

In [None]:
# Continue on a new copy so imdb3 keeps the state reached after the numeric fixes.
imdb4 = imdb3.copy()

### genres

In [None]:
# Frequency of each distinct genres string.
imdb4["genres"].value_counts()

We should separate values.

In [None]:
# Split the "|"-separated genres string into one column per position;
# positions a movie does not use are filled with the placeholder "-".
genero_list = imdb4["genres"].str.split("|", expand=True).fillna("-")

# Copy each positional column into imdb4 as genre_0, genre_1, ...
for position, values in genero_list.items():
    imdb4[f"genre_{position}"] = values

# The combined genres column is now redundant, so remove it.
imdb4.drop(columns=["genres"], inplace=True)
imdb4.head()

### actors

In [None]:
# Frequency of each distinct actors string.
imdb4["actors"].value_counts()

We should separate values.

In [None]:
# Split the ","-separated actors string into one column per position;
# positions a movie does not use are filled with the placeholder "-".
actors_list = imdb4["actors"].str.split(",", expand=True).fillna("-")

# Copy each positional column into imdb4 as actor_0, actor_1, ...
for position, values in actors_list.items():
    imdb4[f"actor_{position}"] = values

# The combined actors column is now redundant, so remove it.
imdb4.drop(columns=["actors"], inplace=True)

imdb4.head()

### movie Title

In [None]:
# Frequency of each title; a count above 1 means the title appears more than once.
imdb4["movie_title"].value_counts()

There are two movies with the same name.

In [None]:
# Disabled draft for removing the repeated title.
# TODO: incomplete. The expression indexes imdb4 by the title values themselves
# rather than by a boolean mask, so it does not select the repeated rows.
# imdb4.drop(imdb4[(imdb4["movie_title"])])

### country

In [None]:
# Frequency of each country value, used to spot spelling variants.
imdb4["country"].value_counts()

We have different values that mean the same: USA, United States, usa

In [None]:
# Collapse the spelling variants of the United States into the single label "USA".
# The previous np.where condition had no parentheses around its comparisons:
# `|` binds tighter than `==`, so Python evaluated
# `"United States" | imdb4["country"]` first and the cell raised an error
# instead of building a boolean mask. Series.replace maps the variants directly
# and leaves every other value (including NaN) unchanged.
imdb4["country"] = imdb4["country"].replace({"United States": "USA", "usa": "USA"})