In [1]:
import pandas as pd
from datetime import datetime

data = pd.read_csv('../data/netflix_titles.csv')

# Nettoyage et Préparation des Données

#### Convertir la colonne "date_added" en format datetime

In [2]:
i = 0
for date in data.iloc[: , 6]:
    if type(date) != float:
        date = datetime.strptime(date.strip(), '%B %d, %Y').date()
    data.iloc[i, 6] = date
    i += 1
data['date_added']

0       2021-09-25
1       2021-09-24
2       2021-09-24
3       2021-09-24
4       2021-09-24
           ...    
8802    2019-11-20
8803    2019-07-01
8804    2019-11-01
8805    2020-01-11
8806    2019-03-02
Name: date_added, Length: 8807, dtype: object

#### Création d'une colonne "is_movie"

In [3]:
data['is_movie'] = False

In [4]:
data.loc[data['type'] == 'Movie', 'is_movie'] = True

In [5]:
data['is_movie']

0        True
1       False
2       False
3       False
4       False
        ...  
8802     True
8803    False
8804     True
8805     True
8806     True
Name: is_movie, Length: 8807, dtype: bool

#### Remplacement des valeurs manquantes dans "country", "director' et 'cast' par 'Unknown'

In [6]:
data.loc[data['country'].isnull(), 'country'] = "Unknown"
data.loc[data['director'].isnull(), 'director'] = "Unknown"
data.loc[data['cast'].isnull(), 'cast'] = "Unknown"

In [7]:
data[['country', 'director', 'cast']]

Unnamed: 0,country,director,cast
0,United States,Kirsten Johnson,Unknown
1,South Africa,Unknown,"Ama Qamata, Khosi Ngema, Gail Mabalane, Thaban..."
2,Unknown,Julien Leclercq,"Sami Bouajila, Tracy Gotoas, Samuel Jouy, Nabi..."
3,Unknown,Unknown,Unknown
4,India,Unknown,"Mayur More, Jitendra Kumar, Ranjan Raj, Alam K..."
...,...,...,...
8802,United States,David Fincher,"Mark Ruffalo, Jake Gyllenhaal, Robert Downey J..."
8803,Unknown,Unknown,Unknown
8804,United States,Ruben Fleischer,"Jesse Eisenberg, Woody Harrelson, Emma Stone, ..."
8805,United States,Peter Hewitt,"Tim Allen, Courteney Cox, Chevy Chase, Kate Ma..."


#### Remplacement des valeurs manquantes dans "rating' et 'duration'
par celle la moyenne des autres valeurs

In [8]:
counter = data.groupby('rating').agg(total=('rating', 'count')).reset_index().sort_values('total', ascending=False).iloc[0, 0]

data.loc[data['rating'].isnull(), 'rating'] = counter

In [9]:
counterMovie = data.loc[data['is_movie'], ].groupby('duration').agg(totalMovie=('duration', 'count')).reset_index().sort_values('totalMovie', ascending=False).iloc[0, 0]

counterNotMovie = data.loc[data['is_movie']==False, ].groupby('duration').agg(totalMovie=('duration', 'count')).reset_index().sort_values('totalMovie', ascending=False).iloc[0, 0]

data.loc[(data['is_movie'] == True & data['duration'].isnull()), 'duration'] = counterMovie
data.loc[(data['is_movie'] == False & data['duration'].isnull()), 'duration'] = counterNotMovie

#### Remplacement des valeurs manquante de 'date_added' par celle de la ligne suivante

In [10]:
followingsRow = data.loc[data['date_added'].isnull(), 'date_added'].index +1

for row in followingsRow:
    data.iloc[row-1, 6] = data.iloc[row, 6]

In [11]:
data.isna().sum()

show_id         0
type            0
title           0
director        0
cast            0
country         0
date_added      0
release_year    0
rating          0
duration        0
listed_in       0
description     0
is_movie        0
dtype: int64

À présent toutes les valeurs manquantes ont été nettoyés et transformés

#### Création d'une colonne "duration_minutes" qui extrait la durée des films en minutes

In [12]:
data['duration_minutes'] = 0
data.loc[data['is_movie'], 'duration_minutes'] = data.loc[data['is_movie'], 'duration']

  data.loc[data['is_movie'], 'duration_minutes'] = data.loc[data['is_movie'], 'duration']


In [13]:
minuteList = data.loc[data['is_movie'], 'duration_minutes']
liste = []
for minutes in minuteList:
    minutes = minutes.split(' ')
    minutes.pop(1)
    minutes = ''.join(minutes)
    minutes = int(minutes)
    liste.append(minutes)
data.loc[data['is_movie'], 'duration_minutes'] = liste

In [14]:
data

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description,is_movie,duration_minutes
0,s1,Movie,Dick Johnson Is Dead,Kirsten Johnson,Unknown,United States,2021-09-25,2020,PG-13,90 min,Documentaries,"As her father nears the end of his life, filmm...",True,90
1,s2,TV Show,Blood & Water,Unknown,"Ama Qamata, Khosi Ngema, Gail Mabalane, Thaban...",South Africa,2021-09-24,2021,TV-MA,1 Season,"International TV Shows, TV Dramas, TV Mysteries","After crossing paths at a party, a Cape Town t...",False,0
2,s3,TV Show,Ganglands,Julien Leclercq,"Sami Bouajila, Tracy Gotoas, Samuel Jouy, Nabi...",Unknown,2021-09-24,2021,TV-MA,1 Season,"Crime TV Shows, International TV Shows, TV Act...",To protect his family from a powerful drug lor...,False,0
3,s4,TV Show,Jailbirds New Orleans,Unknown,Unknown,Unknown,2021-09-24,2021,TV-MA,1 Season,"Docuseries, Reality TV","Feuds, flirtations and toilet talk go down amo...",False,0
4,s5,TV Show,Kota Factory,Unknown,"Mayur More, Jitendra Kumar, Ranjan Raj, Alam K...",India,2021-09-24,2021,TV-MA,1 Season,"International TV Shows, Romantic TV Shows, TV ...",In a city of coaching centers known to train I...,False,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8802,s8803,Movie,Zodiac,David Fincher,"Mark Ruffalo, Jake Gyllenhaal, Robert Downey J...",United States,2019-11-20,2007,R,158 min,"Cult Movies, Dramas, Thrillers","A political cartoonist, a crime reporter and a...",True,158
8803,s8804,TV Show,Zombie Dumb,Unknown,Unknown,Unknown,2019-07-01,2018,TV-Y7,1 Season,"Kids' TV, Korean TV Shows, TV Comedies","While living alone in a spooky town, a young g...",False,0
8804,s8805,Movie,Zombieland,Ruben Fleischer,"Jesse Eisenberg, Woody Harrelson, Emma Stone, ...",United States,2019-11-01,2009,R,88 min,"Comedies, Horror Movies",Looking to survive in a world taken over by zo...,True,88
8805,s8806,Movie,Zoom,Peter Hewitt,"Tim Allen, Courteney Cox, Chevy Chase, Kate Ma...",United States,2020-01-11,2006,PG,88 min,"Children & Family Movies, Comedies","Dragged from civilian life, a former superhero...",True,88


# Sauvegarde du DataFrame

In [15]:
data.to_csv('../data/data_cleaned.csv', index=False)