In [2]:
import numpy as np
import pandas as pd

# Read the raw ranking export (relative path) and preview its first rows.
raw_csv_path = '../data/Most_popular_1000_Youtube_videos.csv'
data = pd.read_csv(raw_csv_path)
data.head()

Unnamed: 0,rank,Video,Video views,Likes,Dislikes,Category,published
0,1,Lil Nas X - Old Town Road (Official Movie) ft....,54071677,3497955,78799.0,Music,2019
1,2,"20 Tennis shots if they were not filmed, NOBOD...",3471237,19023,859.0,,2017
2,3,JoJo Siwa - Karma (Official Video),34206747,293563,,Music,2024
3,4,David Kushner - Daylight (Official Music Video),18558390,680732,,Music,2023
4,5,Wiz Khalifa - See You Again ft. Charlie Puth [...,6547981039,44428537,,Music,2015


### Standardisation des noms de colonnes

In [3]:
def standardisation(nom_colonne):
    """Return a normalised version of a column label.

    The label is converted to text, lower-cased, and every space is
    replaced by an underscore (e.g. ``'Video views'`` -> ``'video_views'``).

    Parameters
    ----------
    nom_colonne : object
        Column label. Non-string labels (for instance integer labels) are
        converted with ``str()`` first instead of raising ``TypeError``.

    Returns
    -------
    str
        The normalised label.
    """
    # Coerce to str *before* calling string methods so that non-string
    # labels are handled rather than rejected.
    return str(nom_colonne).lower().replace(' ', '_')

# Normalise every column label in a single pass: rename() accepts a callable
# and applies it to each label, so no per-column loop or in-place mutation
# is needed.
data = data.rename(columns=standardisation)
data

Unnamed: 0,rank,video,video_views,likes,dislikes,category,published
0,1,Lil Nas X - Old Town Road (Official Movie) ft....,54071677,3497955,78799,Music,2019
1,2,"20 Tennis shots if they were not filmed, NOBOD...",3471237,19023,859,,2017
2,3,JoJo Siwa - Karma (Official Video),34206747,293563,,Music,2024
3,4,David Kushner - Daylight (Official Music Video),18558390,680732,,Music,2023
4,5,Wiz Khalifa - See You Again ft. Charlie Puth [...,6547981039,44428537,,Music,2015
...,...,...,...,...,...,...,...
995,996,New Champ Kayn/Rhaast Leak for LOL (Moobeat cr...,847249,1857,173,People & Blogs,2017
996,997,Ford Mustang Launch (street),1001605,2214,27,Autos & Vehicles,2008
997,998,Eminem is gay - The Interview,2718939,43492,0,Entertainment,2014
998,999,Yakuza OST - Baka Mitai (ばかみたい) Kiryu full ver...,52890986,850425,0,Gaming,2017


### Vérification du type

In [4]:
# Inspect the dtype pandas inferred for each column.
data.dtypes

rank            int64
video          object
video_views    object
likes          object
dislikes       object
category       object
published       int64
dtype: object

In [5]:
# Columns that hold counts but were loaded as text.
columnList = ['video_views', 'likes', 'dislikes']

for column in columnList:
    # astype(str) keeps this cell re-runnable: once a column is numeric, the
    # .str accessor would otherwise raise AttributeError on a second run.
    # regex=False makes the comma a literal character on every pandas version.
    without_separator = data[column].astype(str).str.replace(',', '', regex=False)
    # Anything that still is not a number (including missing values) becomes NaN.
    data[column] = pd.to_numeric(without_separator, errors='coerce')

data[columnList]

Unnamed: 0,video_views,likes,dislikes
0,54071677,3497955,78799.0
1,3471237,19023,859.0
2,34206747,293563,
3,18558390,680732,
4,6547981039,44428537,
...,...,...,...
995,847249,1857,173.0
996,1001605,2214,27.0
997,2718939,43492,0.0
998,52890986,850425,0.0


In [6]:
# Re-check the column dtypes after the numeric conversion.
data.dtypes

rank             int64
video           object
video_views      int64
likes            int64
dislikes       float64
category        object
published        int64
dtype: object

### Vérification des Outliers

In [7]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
data.describe()

Unnamed: 0,rank,video_views,likes,dislikes,published
count,1000.0,1000.0,1000.0,527.0,1000.0
mean,500.5,24534350.0,368545.1,2322.324478,2019.1
std,288.819436,251257000.0,1629418.0,9653.17036,5.384328
min,1.0,44939.0,433.0,0.0,2005.0
25%,250.75,981569.0,9427.25,200.0,2017.0
50%,500.5,2341652.0,30262.0,477.0,2021.0
75%,750.25,11626380.0,164985.8,1469.0,2024.0
max,1000.0,6547981000.0,44428540.0,178042.0,2025.0


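Aucune valeur aberrante à corriger : les minimums sont positifs et les années de publication (2005–2025) sont plausibles. Les maximums très élevés (≈ 6,5 milliards de vues) correspondent à des vidéos réellement très populaires et sont donc conservés.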

### Vérification des Doublons

In [8]:
# Show every row whose title occurs more than once
# (keep=False flags all occurrences, not only the later ones).
repeated_title = data['video'].duplicated(keep=False)
data[repeated_title]

Unnamed: 0,rank,video,video_views,likes,dislikes,category,published
17,18,MILLION DOLLAR BABY,11894076,226375,,Music,2024
196,197,Tiryakinim,6332540,64248,,Music,2024
205,206,Tiryakinim,8170818,45890,,Music,2024
215,216,Bıraktığın Gibi Burdayım,1291926,10366,,Music,2024
267,268,Bıraktığın Gibi Burdayım,1889571,13340,,Music,2024
500,501,MILLION DOLLAR BABY,483750,30078,,Music,2024


In [9]:
# Keep only the first row for each title and renumber the index from 0.
# NOTE(review): rows are matched on the title alone — confirm that videos
# sharing a title are really duplicates and not distinct uploads.
data = data.drop_duplicates(subset=['video'], ignore_index=True)

In [10]:
# Same check as before the deduplication: rows whose title occurs more than
# once (keep=False flags all occurrences). An empty result means none remain.
repeated_title = data['video'].duplicated(keep=False)
data[repeated_title]

Unnamed: 0,rank,video,video_views,likes,dislikes,category,published


### Valeurs manquantes

In [11]:
# Count the missing values in each column.
data.isnull().sum()

rank             0
video            0
video_views      0
likes            0
dislikes       470
category        18
published        0
dtype: int64

In [12]:
# Replace missing values with explicit placeholders: 0 for the dislike count,
# 'Unknown' for the category.
# NOTE(review): a filled 0 is indistinguishable from a real zero-dislike
# count — confirm this is acceptable for downstream analysis.
data['dislikes'] = data['dislikes'].fillna(0)
data['category'] = data['category'].fillna('Unknown')

In [13]:
data.isnull().sum()

rank           0
video          0
video_views    0
likes          0
dislikes       0
category       0
published      0
dtype: int64

### Réinitialisation des rangs

In [14]:
# Renumber the ranks consecutively from 1, following the current row order.
data['rank'] = np.arange(len(data)) + 1

### Sauvegarde du DataFrame obtenu

In [15]:
# Write the cleaned frame to CSV; index=False leaves the row index out of the file.
data.to_csv('../data/data_cleaned.csv', index=False)