In [1]:
import numpy as np
import pandas as pd
from pathlib import Path
from collections import Counter

In [2]:
file_path = "movies_oscars.csv"

df = pd.read_csv(file_path)
df

Unnamed: 0,budget,original_title,popularity,release_date,revenue,runtime,status,title,vote_average,vote_count,winner,genres
0,30000000,Toy Story,21.946943,10/30/1995,373554033.0,81.0,Released,Toy Story,7.7,5415.0,FALSE,Animation
1,65000000,Jumanji,17.015539,12/15/1995,262797249.0,104.0,Released,Jumanji,6.9,2413.0,,Adventure
2,0,Grumpier Old Men,11.7129,12/22/1995,0.0,101.0,Released,Grumpier Old Men,6.5,92.0,,Romance
3,16000000,Waiting to Exhale,3.859495,12/22/1995,81452156.0,127.0,Released,Waiting to Exhale,6.1,34.0,,Comedy
4,0,Father of the Bride Part II,8.387519,2/10/1995,76578911.0,106.0,Released,Father of the Bride Part II,5.7,173.0,,Comedy
...,...,...,...,...,...,...,...,...,...,...,...,...
45461,0,رگ خواب,0.072051,,0.0,90.0,Released,Subdue,4.0,1.0,,Drama
45462,0,Siglo ng Pagluluwal,0.178241,11/17/2011,0.0,360.0,Released,Century of Birthing,9.0,3.0,,Drama
45463,0,Betrayal,0.903007,8/1/2003,0.0,90.0,Released,Betrayal,3.8,6.0,FALSE,Action
45464,0,Satana likuyushchiy,0.003503,10/21/1917,0.0,87.0,Released,Satan Triumphant,0.0,0.0,,


In [3]:
df.dtypes

budget             object
original_title     object
popularity         object
release_date       object
revenue           float64
runtime           float64
status             object
title              object
vote_average      float64
vote_count        float64
winner             object
genres             object
dtype: object

In [4]:
##find non numerical values that contain jpg to drop from budget column
df[df['budget'].str.contains("jpg")]

Unnamed: 0,budget,original_title,popularity,release_date,revenue,runtime,status,title,vote_average,vote_count,winner,genres
19730,/ff9qCepilowshEtG2GYWwzt2bs4.jpg,"[{'iso_639_1': 'en', 'name': 'English'}]",,1,,,,,,,,name
29503,/zV8bHuSL6WXoD6FWogP9j4x80bL.jpg,"[{'iso_639_1': 'ja', 'name': '日本語'}]",,12,,,,,,,,name
35587,/zaSf5OG7V8X8gqFvly88zDdRm46.jpg,"[{'iso_639_1': 'en', 'name': 'English'}]",Beware Of Frost Bites,22,,,,,,,,name


In [5]:
##drop .jpg rows and convert budget column to integer
a = ['/ff9qCepilowshEtG2GYWwzt2bs4.jpg','/zV8bHuSL6WXoD6FWogP9j4x80bL.jpg','/zaSf5OG7V8X8gqFvly88zDdRm46.jpg']

df = df[~df['budget'].isin(a)]
df['budget'].astype(str).astype(float)

0        30000000.0
1        65000000.0
2               0.0
3        16000000.0
4               0.0
            ...    
45461           0.0
45462           0.0
45463           0.0
45464           0.0
45465           0.0
Name: budget, Length: 45463, dtype: float64

In [6]:
##find unique values in winner column
df['winner'].unique()

array(['FALSE', nan, 'TRUE', 'winner'], dtype=object)

In [7]:
## replace NaN's with 0
df["winner"]=df["winner"].fillna(0)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [8]:
## function to create a binary value for those movies to be nominated
def nominated(nom):
    if nom != 0:
        nominated = 1
    else:
        nominated = 0
    return nominated

In [9]:
##create new "nominated" column which includes binary output
df["nominated"]=df["winner"].apply(nominated)
df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


Unnamed: 0,budget,original_title,popularity,release_date,revenue,runtime,status,title,vote_average,vote_count,winner,genres,nominated
0,30000000,Toy Story,21.946943,10/30/1995,373554033.0,81.0,Released,Toy Story,7.7,5415.0,FALSE,Animation,1
1,65000000,Jumanji,17.015539,12/15/1995,262797249.0,104.0,Released,Jumanji,6.9,2413.0,0,Adventure,0
2,0,Grumpier Old Men,11.7129,12/22/1995,0.0,101.0,Released,Grumpier Old Men,6.5,92.0,0,Romance,0
3,16000000,Waiting to Exhale,3.859495,12/22/1995,81452156.0,127.0,Released,Waiting to Exhale,6.1,34.0,0,Comedy,0
4,0,Father of the Bride Part II,8.387519,2/10/1995,76578911.0,106.0,Released,Father of the Bride Part II,5.7,173.0,0,Comedy,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
45461,0,رگ خواب,0.072051,,0.0,90.0,Released,Subdue,4.0,1.0,0,Drama,0
45462,0,Siglo ng Pagluluwal,0.178241,11/17/2011,0.0,360.0,Released,Century of Birthing,9.0,3.0,0,Drama,0
45463,0,Betrayal,0.903007,8/1/2003,0.0,90.0,Released,Betrayal,3.8,6.0,FALSE,Action,1
45464,0,Satana likuyushchiy,0.003503,10/21/1917,0.0,87.0,Released,Satan Triumphant,0.0,0.0,0,,0


In [10]:
##replace string values with binary 
df['winner']=df['winner'].replace('FALSE',0)
df['winner']=df['winner'].replace('TRUE',1)
df['winner']=df['winner'].replace('winner',1)
df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.


Unnamed: 0,budget,original_title,popularity,release_date,revenue,runtime,status,title,vote_average,vote_count,winner,genres,nominated
0,30000000,Toy Story,21.946943,10/30/1995,373554033.0,81.0,Released,Toy Story,7.7,5415.0,0,Animation,1
1,65000000,Jumanji,17.015539,12/15/1995,262797249.0,104.0,Released,Jumanji,6.9,2413.0,0,Adventure,0
2,0,Grumpier Old Men,11.7129,12/22/1995,0.0,101.0,Released,Grumpier Old Men,6.5,92.0,0,Romance,0
3,16000000,Waiting to Exhale,3.859495,12/22/1995,81452156.0,127.0,Released,Waiting to Exhale,6.1,34.0,0,Comedy,0
4,0,Father of the Bride Part II,8.387519,2/10/1995,76578911.0,106.0,Released,Father of the Bride Part II,5.7,173.0,0,Comedy,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
45461,0,رگ خواب,0.072051,,0.0,90.0,Released,Subdue,4.0,1.0,0,Drama,0
45462,0,Siglo ng Pagluluwal,0.178241,11/17/2011,0.0,360.0,Released,Century of Birthing,9.0,3.0,0,Drama,0
45463,0,Betrayal,0.903007,8/1/2003,0.0,90.0,Released,Betrayal,3.8,6.0,0,Action,1
45464,0,Satana likuyushchiy,0.003503,10/21/1917,0.0,87.0,Released,Satan Triumphant,0.0,0.0,0,,0


In [11]:
##convert winner column from string to float
df['winner'].astype(str).astype(float)

0        0.0
1        0.0
2        0.0
3        0.0
4        0.0
        ... 
45461    0.0
45462    0.0
45463    0.0
45464    0.0
45465    0.0
Name: winner, Length: 45463, dtype: float64

In [12]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 45463 entries, 0 to 45465
Data columns (total 13 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   budget          45463 non-null  object 
 1   original_title  45463 non-null  object 
 2   popularity      45460 non-null  object 
 3   release_date    45376 non-null  object 
 4   revenue         45460 non-null  float64
 5   runtime         45203 non-null  float64
 6   status          45379 non-null  object 
 7   title           45460 non-null  object 
 8   vote_average    45460 non-null  float64
 9   vote_count      45460 non-null  float64
 10  winner          45463 non-null  int64  
 11  genres          43021 non-null  object 
 12  nominated       45463 non-null  int64  
dtypes: float64(4), int64(2), object(7)
memory usage: 4.9+ MB


In [13]:
df.to_csv("movies_clean.csv", index=False)