## Pandas - модуль, призначений для обробки та аналізу даних на Python.
## Pandas побудований на основі іншої бібліотеки під назвою NumPy. Він використовує типи даних NumPy, тому між об'єктами обох бібліотек можна легко переходити.

In [1]:
import pandas as pd
import numpy as np

## Pandas працює з двома основними структурами даних: Series і DataFrame

### Сприймайте Series як колонку з індексом
### У Series може бути визначений тип даних (атрибут dtype)

In [2]:
my_data = ['Adenosine', 'Thymine', 'Citosine', 'Guanine']

my_series_1 = pd.Series(my_data)

In [3]:
my_series_1

0    Adenosine
1      Thymine
2     Citosine
3      Guanine
dtype: object

In [4]:
type(my_series_1)

pandas.core.series.Series

In [5]:
my_other_data = [1, 2, 3, 4]

mah_series_2 = pd.Series(my_other_data)

In [6]:
mah_series_2

0    1
1    2
2    3
3    4
dtype: int64

In [7]:
my_other_data_2 = [1, 2, 3, 4.0]

mah_series_3 = pd.Series(my_other_data_2)

In [8]:
mah_series_3

0    1.0
1    2.0
2    3.0
3    4.0
dtype: float64

In [9]:
mah_series_2 +  mah_series_3

0    2.0
1    4.0
2    6.0
3    8.0
dtype: float64

In [10]:
mah_series_2**2

0     1
1     4
2     9
3    16
dtype: int64

In [11]:
mah_series_3 % 2

0    1.0
1    0.0
2    1.0
3    0.0
dtype: float64

### Індекс може бути визначений окремим параметром і не обов'язково має бути послідовністю цілих чисел

In [12]:
# Series with explicit string labels as its index and a name attached to it
s1 = pd.Series([1, 2], index=["A", "B"], name="s1")

In [13]:
s1

A    1
B    2
Name: s1, dtype: int64

In [14]:
s2 = pd.Series([3, 4], index=['A', 'B'], name='s2')

In [15]:
s2

A    3
B    4
Name: s2, dtype: int64

## Ми можемо комбінувати Series різними способами

In [16]:
comb_1 = pd.concat([s1, s2])

In [17]:
comb_1

A    1
B    2
A    3
B    4
dtype: int64

In [18]:
type(comb_1)

pandas.core.series.Series

In [19]:
comb_2 = pd.concat([s1, s2], axis=1)

In [20]:
comb_2

Unnamed: 0,s1,s2
A,1,3
B,2,4


In [21]:
type(comb_2)

pandas.core.frame.DataFrame

### comb_2 - об'єкт типу DataFrame. DataFrame - це "таблиця", де кожна колонка - окрема Series, і ці Series об'єднані між собою за спільним індексом

In [22]:
pd.concat([s1, mah_series_3], axis=1)

Unnamed: 0,s1,0
A,1.0,
B,2.0,
0,,1.0
1,,2.0
2,,3.0
3,,4.0


In [23]:
df = pd.concat([s1, mah_series_3], axis=1)

In [24]:
df.columns = ["c1", "c2"]

In [25]:
df

Unnamed: 0,c1,c2
A,1.0,
B,2.0,
0,,1.0
1,,2.0
2,,3.0
3,,4.0


## В основному, звісно, ви будете працювати з DataFrame, отриманими в результаті читання тих чи інших файлів 
## У Pandas є конектори для читання великої кількості різних форматів файлів (csv, json, xlsx, xml, parquet, orc тощо)

In [26]:
# Resolve the current working directory so data files can be located relative to it
import os
import pathlib

# Path.cwd() yields the same directory as Path(os.getcwd()) without the detour through os
cwd = pathlib.Path.cwd()

In [27]:
# Location of the diabetes dataset, relative to the working directory
diabetes_csv_path = cwd / "diabetes" / "diabetes_prediction_dataset.csv"

In [28]:
diabetes_df = pd.read_csv(diabetes_csv_path)

In [29]:
# header=None tells pandas the file has no header line: the real header is then read
# as data row 0 and the columns receive integer labels instead of names.
pd.read_csv(diabetes_csv_path, header=None)

  pd.read_csv(diabetes_csv_path, header=None)


Unnamed: 0,0,1,2,3,4,5,6,7,8
0,gender,age,hypertension,heart_disease,smoking_history,bmi,HbA1c_level,blood_glucose_level,diabetes
1,Female,80.0,0,1,never,25.19,6.6,140,0
2,Female,54.0,0,0,No Info,27.32,6.6,80,0
3,Male,28.0,0,0,never,27.32,5.7,158,0
4,Female,36.0,0,0,current,23.45,5.0,155,0
...,...,...,...,...,...,...,...,...,...
99996,Female,80.0,0,0,No Info,27.32,6.2,90,0
99997,Female,2.0,0,0,No Info,17.37,6.5,100,0
99998,Male,66.0,0,0,former,27.83,5.7,155,0
99999,Female,24.0,0,0,never,35.42,4.0,100,0


In [30]:
diabetes_df

Unnamed: 0,gender,age,hypertension,heart_disease,smoking_history,bmi,HbA1c_level,blood_glucose_level,diabetes
0,Female,80.0,0,1,never,25.19,6.6,140,0
1,Female,54.0,0,0,No Info,27.32,6.6,80,0
2,Male,28.0,0,0,never,27.32,5.7,158,0
3,Female,36.0,0,0,current,23.45,5.0,155,0
4,Male,76.0,1,1,current,20.14,4.8,155,0
...,...,...,...,...,...,...,...,...,...
99995,Female,80.0,0,0,No Info,27.32,6.2,90,0
99996,Female,2.0,0,0,No Info,17.37,6.5,100,0
99997,Male,66.0,0,0,former,27.83,5.7,155,0
99998,Female,24.0,0,0,never,35.42,4.0,100,0


### DataFrame можна фільтрувати різними способами: обирати тільки потрібні вам колонки, відбирати рядки за значеннями певної колонки тощо

In [31]:
diabetes_df["gender"]

0        Female
1        Female
2          Male
3        Female
4          Male
          ...  
99995    Female
99996    Female
99997      Male
99998    Female
99999    Female
Name: gender, Length: 100000, dtype: object

In [32]:
diabetes_df[["gender", "age", "smoking_history"]]

Unnamed: 0,gender,age,smoking_history
0,Female,80.0,never
1,Female,54.0,No Info
2,Male,28.0,never
3,Female,36.0,current
4,Male,76.0,current
...,...,...,...
99995,Female,80.0,No Info
99996,Female,2.0,No Info
99997,Male,66.0,former
99998,Female,24.0,never


In [33]:
diabetes_df[diabetes_df["gender"] == "Female"]

Unnamed: 0,gender,age,hypertension,heart_disease,smoking_history,bmi,HbA1c_level,blood_glucose_level,diabetes
0,Female,80.0,0,1,never,25.19,6.6,140,0
1,Female,54.0,0,0,No Info,27.32,6.6,80,0
3,Female,36.0,0,0,current,23.45,5.0,155,0
5,Female,20.0,0,0,never,27.32,6.6,85,0
6,Female,44.0,0,0,never,19.31,6.5,200,1
...,...,...,...,...,...,...,...,...,...
99994,Female,36.0,0,0,No Info,24.60,4.8,145,0
99995,Female,80.0,0,0,No Info,27.32,6.2,90,0
99996,Female,2.0,0,0,No Info,17.37,6.5,100,0
99998,Female,24.0,0,0,never,35.42,4.0,100,0


In [34]:
# Build one boolean mask per condition, then keep only the rows where all three hold
is_female = diabetes_df["gender"] == "Female"
has_hypertension = diabetes_df["hypertension"] != 0
has_diabetes = diabetes_df["diabetes"] == 1

filtered_women_df = diabetes_df[is_female & has_hypertension & has_diabetes]

In [35]:
filtered_women_df

Unnamed: 0,gender,age,hypertension,heart_disease,smoking_history,bmi,HbA1c_level,blood_glucose_level,diabetes
104,Female,80.0,1,0,never,27.32,6.8,280,1
350,Female,52.0,1,0,never,50.30,6.6,155,1
466,Female,79.0,1,0,former,27.32,6.5,159,1
531,Female,49.0,1,0,not current,36.93,8.8,155,1
727,Female,38.0,1,0,not current,27.32,6.1,160,1
...,...,...,...,...,...,...,...,...,...
99739,Female,65.0,1,0,never,35.00,7.0,145,1
99792,Female,80.0,1,0,never,21.33,6.8,240,1
99845,Female,43.0,1,0,never,34.21,6.5,160,1
99935,Female,65.0,1,1,never,33.55,8.2,140,1


In [36]:
filtered_women_df.query("blood_glucose_level > 200")

Unnamed: 0,gender,age,hypertension,heart_disease,smoking_history,bmi,HbA1c_level,blood_glucose_level,diabetes
104,Female,80.0,1,0,never,27.32,6.8,280,1
882,Female,75.0,1,0,No Info,27.32,6.6,240,1
1082,Female,53.0,1,0,never,41.76,6.8,300,1
1135,Female,62.0,1,1,never,43.16,8.8,280,1
1185,Female,71.0,1,0,No Info,42.44,6.8,220,1
...,...,...,...,...,...,...,...,...,...
98353,Female,76.0,1,0,never,38.59,6.6,300,1
99067,Female,73.0,1,0,never,45.65,6.0,240,1
99149,Female,54.0,1,0,ever,31.86,5.7,280,1
99537,Female,67.0,1,0,ever,34.30,5.7,300,1


### Найголовніше призначення Pandas - EDA (exploratory data analysis). Тому він уміє показувати багато статистичних показників і метаданих про ваші дані

In [37]:
diabetes_df.head(20)

Unnamed: 0,gender,age,hypertension,heart_disease,smoking_history,bmi,HbA1c_level,blood_glucose_level,diabetes
0,Female,80.0,0,1,never,25.19,6.6,140,0
1,Female,54.0,0,0,No Info,27.32,6.6,80,0
2,Male,28.0,0,0,never,27.32,5.7,158,0
3,Female,36.0,0,0,current,23.45,5.0,155,0
4,Male,76.0,1,1,current,20.14,4.8,155,0
5,Female,20.0,0,0,never,27.32,6.6,85,0
6,Female,44.0,0,0,never,19.31,6.5,200,1
7,Female,79.0,0,0,No Info,23.86,5.7,85,0
8,Male,42.0,0,0,never,33.64,4.8,145,0
9,Female,32.0,0,0,never,27.32,5.0,100,0


In [38]:
diabetes_df.tail(15)

Unnamed: 0,gender,age,hypertension,heart_disease,smoking_history,bmi,HbA1c_level,blood_glucose_level,diabetes
99985,Male,25.0,0,0,No Info,27.32,5.8,145,0
99986,Female,63.0,0,0,never,29.01,4.8,145,0
99987,Female,23.0,0,0,never,17.87,5.8,100,0
99988,Male,19.0,0,0,No Info,25.75,5.0,145,0
99989,Female,26.0,0,0,No Info,27.32,5.0,158,0
99990,Male,39.0,0,0,No Info,27.32,6.1,100,0
99991,Male,22.0,0,0,current,29.65,6.0,80,0
99992,Female,26.0,0,0,never,34.34,6.5,160,0
99993,Female,40.0,0,0,never,40.69,3.5,155,0
99994,Female,36.0,0,0,No Info,24.6,4.8,145,0


In [39]:
diabetes_df.dtypes

gender                  object
age                    float64
hypertension             int64
heart_disease            int64
smoking_history         object
bmi                    float64
HbA1c_level            float64
blood_glucose_level      int64
diabetes                 int64
dtype: object

In [40]:
diabetes_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 9 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   gender               100000 non-null  object 
 1   age                  100000 non-null  float64
 2   hypertension         100000 non-null  int64  
 3   heart_disease        100000 non-null  int64  
 4   smoking_history      100000 non-null  object 
 5   bmi                  100000 non-null  float64
 6   HbA1c_level          100000 non-null  float64
 7   blood_glucose_level  100000 non-null  int64  
 8   diabetes             100000 non-null  int64  
dtypes: float64(3), int64(4), object(2)
memory usage: 6.9+ MB


In [41]:
diabetes_df.describe()

Unnamed: 0,age,hypertension,heart_disease,bmi,HbA1c_level,blood_glucose_level,diabetes
count,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0
mean,41.885856,0.07485,0.03942,27.320767,5.527507,138.05806,0.085
std,22.51684,0.26315,0.194593,6.636783,1.070672,40.708136,0.278883
min,0.08,0.0,0.0,10.01,3.5,80.0,0.0
25%,24.0,0.0,0.0,23.63,4.8,100.0,0.0
50%,43.0,0.0,0.0,27.32,5.8,140.0,0.0
75%,60.0,0.0,0.0,29.58,6.2,159.0,0.0
max,80.0,1.0,1.0,95.69,9.0,300.0,1.0


### В Pandas можливі операції з окремими рядками

In [42]:
diabetes_df.loc[0]

gender                 Female
age                      80.0
hypertension                0
heart_disease               1
smoking_history         never
bmi                     25.19
HbA1c_level               6.6
blood_glucose_level       140
diabetes                    0
Name: 0, dtype: object

In [57]:
diabetes_df.iloc[10]

gender                 Female
age                      53.0
hypertension                0
heart_disease               0
smoking_history         never
bmi                     27.32
HbA1c_level               6.1
blood_glucose_level        85
diabetes                    0
Name: 10, dtype: object

In [43]:
# Row assignment through .loc writes into the DataFrame itself:
# after this statement row 0 holds the same values as row 1.
diabetes_df.loc[0] = diabetes_df.loc[1]

In [44]:
diabetes_df.head(3)

Unnamed: 0,gender,age,hypertension,heart_disease,smoking_history,bmi,HbA1c_level,blood_glucose_level,diabetes
0,Female,54.0,0,0,No Info,27.32,6.6,80,0
1,Female,54.0,0,0,No Info,27.32,6.6,80,0
2,Male,28.0,0,0,never,27.32,5.7,158,0


In [45]:
# Unlike Python list slicing, a .loc label slice includes both endpoints:
# 0:7 selects the eight rows labelled 0 through 7.
diabetes_df.loc[0:7, ["heart_disease", "bmi"]]

Unnamed: 0,heart_disease,bmi
0,0,27.32
1,0,27.32
2,0,27.32
3,0,23.45
4,1,20.14
5,0,27.32
6,0,19.31
7,0,23.86


### Ви так само можете змінювати індекси в DataFrame, як забажаєте

In [49]:
# set_index returns a new DataFrame indexed by the chosen column;
# diabetes_df itself keeps its integer index unless the result is assigned.
diabetes_df.set_index("smoking_history")

Unnamed: 0_level_0,gender,age,hypertension,heart_disease,bmi,HbA1c_level,blood_glucose_level,diabetes
smoking_history,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
No Info,Female,54.0,0,0,27.32,6.6,80,0
No Info,Female,54.0,0,0,27.32,6.6,80,0
never,Male,28.0,0,0,27.32,5.7,158,0
current,Female,36.0,0,0,23.45,5.0,155,0
current,Male,76.0,1,1,20.14,4.8,155,0
...,...,...,...,...,...,...,...,...
No Info,Female,80.0,0,0,27.32,6.2,90,0
No Info,Female,2.0,0,0,17.37,6.5,100,0
former,Male,66.0,0,0,27.83,5.7,155,0
never,Female,24.0,0,0,35.42,4.0,100,0


In [50]:
diabetes_df

Unnamed: 0,gender,age,hypertension,heart_disease,smoking_history,bmi,HbA1c_level,blood_glucose_level,diabetes
0,Female,54.0,0,0,No Info,27.32,6.6,80,0
1,Female,54.0,0,0,No Info,27.32,6.6,80,0
2,Male,28.0,0,0,never,27.32,5.7,158,0
3,Female,36.0,0,0,current,23.45,5.0,155,0
4,Male,76.0,1,1,current,20.14,4.8,155,0
...,...,...,...,...,...,...,...,...,...
99995,Female,80.0,0,0,No Info,27.32,6.2,90,0
99996,Female,2.0,0,0,No Info,17.37,6.5,100,0
99997,Male,66.0,0,0,former,27.83,5.7,155,0
99998,Female,24.0,0,0,never,35.42,4.0,100,0


In [51]:
reindexed_df = diabetes_df.set_index("smoking_history")

In [56]:
reindexed_df.loc["never"]

Unnamed: 0_level_0,gender,age,hypertension,heart_disease,bmi,HbA1c_level,blood_glucose_level,diabetes
smoking_history,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
never,Male,28.0,0,0,27.32,5.7,158,0
never,Female,20.0,0,0,27.32,6.6,85,0
never,Female,44.0,0,0,19.31,6.5,200,1
never,Male,42.0,0,0,33.64,4.8,145,0
never,Female,32.0,0,0,27.32,5.0,100,0
...,...,...,...,...,...,...,...,...
never,Female,63.0,0,0,29.01,4.8,145,0
never,Female,23.0,0,0,17.87,5.8,100,0
never,Female,26.0,0,0,34.34,6.5,160,0
never,Female,40.0,0,0,40.69,3.5,155,0


In [68]:
diabetes_df

Unnamed: 0,gender,age,hypertension,heart_disease,smoking_history,bmi,HbA1c_level,blood_glucose_level,diabetes
0,Female,54.0,0,0,No Info,27.32,6.6,80,0
1,Female,54.0,0,0,No Info,27.32,6.6,80,0
2,Male,28.0,0,0,never,27.32,5.7,158,0
3,Female,36.0,0,0,current,23.45,5.0,155,0
4,Male,76.0,1,1,current,20.14,4.8,155,0
...,...,...,...,...,...,...,...,...,...
99995,Female,80.0,0,0,No Info,27.32,6.2,90,0
99996,Female,2.0,0,0,No Info,17.37,6.5,100,0
99997,Male,66.0,0,0,former,27.83,5.7,155,0
99998,Female,24.0,0,0,never,35.42,4.0,100,0


### DataFrame так само можна комбінувати між собою

In [90]:
# US trending-videos table and its category lookup file
us_video_csv_path = cwd / "youtube_trends" / "USvideos.csv"
us_cat_json_path = cwd / "youtube_trends" / "US_category_id.json"

In [111]:
# GB trending-videos table and its category lookup file
gb_video_csv_path = cwd / "youtube_trends" / "GBvideos.csv"
gb_cat_json_path = cwd / "youtube_trends" / "GB_category_id.json"

In [91]:
us_video_df = pd.read_csv(us_video_csv_path)

In [113]:
gb_video_df = pd.read_csv(gb_video_csv_path)

In [114]:
gb_video_df.head(2)

Unnamed: 0,video_id,trending_date,title,channel_title,category_id,publish_time,tags,views,likes,dislikes,comment_count,thumbnail_link,comments_disabled,ratings_disabled,video_error_or_removed,description
0,Jw1Y-zhQURU,17.14.11,John Lewis Christmas Ad 2017 - #MozTheMonster,John Lewis,26,2017-11-10T07:38:29.000Z,"christmas|""john lewis christmas""|""john lewis""|...",7224515,55681,10247,9479,https://i.ytimg.com/vi/Jw1Y-zhQURU/default.jpg,False,False,False,Click here to continue the story and make your...
1,3s1rvMFUweQ,17.14.11,Taylor Swift: …Ready for It? (Live) - SNL,Saturday Night Live,24,2017-11-12T06:24:44.000Z,"SNL|""Saturday Night Live""|""SNL Season 43""|""Epi...",1053632,25561,2294,2757,https://i.ytimg.com/vi/3s1rvMFUweQ/default.jpg,False,False,False,Musical guest Taylor Swift performs …Ready for...


In [117]:
gb_video_df.describe()

Unnamed: 0,category_id,views,likes,dislikes,comment_count
count,38916.0,38916.0,38916.0,38916.0,38916.0
mean,16.827937,5911944.0,134519.6,7612.56,13088.35
std,7.752728,19001210.0,349989.3,50956.83,50667.4
min,1.0,851.0,0.0,0.0,0.0
25%,10.0,251527.2,5897.0,200.0,679.0
50%,20.0,981889.0,25182.5,821.0,2478.0
75%,24.0,3683628.0,114089.2,3357.5,9241.5
max,43.0,424538900.0,5613827.0,1944971.0,1626501.0


In [96]:
us_video_df.head(2)

Unnamed: 0,video_id,trending_date,title,channel_title,category_id,publish_time,tags,views,likes,dislikes,comment_count,thumbnail_link,comments_disabled,ratings_disabled,video_error_or_removed,description
0,2kyS6SvSYSE,17.14.11,WE WANT TO TALK ABOUT OUR MARRIAGE,CaseyNeistat,22,2017-11-13T17:13:01.000Z,SHANtell martin,748374,57527,2966,15954,https://i.ytimg.com/vi/2kyS6SvSYSE/default.jpg,False,False,False,SHANTELL'S CHANNEL - https://www.youtube.com/s...
1,1ZAPwfrtAFY,17.14.11,The Trump Presidency: Last Week Tonight with J...,LastWeekTonight,24,2017-11-13T07:30:00.000Z,"last week tonight trump presidency|""last week ...",2418783,97185,6146,12703,https://i.ytimg.com/vi/1ZAPwfrtAFY/default.jpg,False,False,False,"One year after the presidential election, John..."


In [118]:
us_video_df.describe()

Unnamed: 0,category_id,views,likes,dislikes,comment_count
count,40949.0,40949.0,40949.0,40949.0,40949.0
mean,19.972429,2360785.0,74266.7,3711.401,8446.804
std,7.568327,7394114.0,228885.3,29029.71,37430.49
min,1.0,549.0,0.0,0.0,0.0
25%,17.0,242329.0,5424.0,202.0,614.0
50%,24.0,681861.0,18091.0,631.0,1856.0
75%,25.0,1823157.0,55417.0,1938.0,5755.0
max,43.0,225211900.0,5613827.0,1674420.0,1361580.0


In [120]:
# With the default axis, concat stacks the rows of both frames:
# 38916 GB rows + 40949 US rows give 79865 rows in the result.
anglo_sax_df = pd.concat([gb_video_df, us_video_df])

In [121]:
anglo_sax_df.head(2)

Unnamed: 0,video_id,trending_date,title,channel_title,category_id,publish_time,tags,views,likes,dislikes,comment_count,thumbnail_link,comments_disabled,ratings_disabled,video_error_or_removed,description
0,Jw1Y-zhQURU,17.14.11,John Lewis Christmas Ad 2017 - #MozTheMonster,John Lewis,26,2017-11-10T07:38:29.000Z,"christmas|""john lewis christmas""|""john lewis""|...",7224515,55681,10247,9479,https://i.ytimg.com/vi/Jw1Y-zhQURU/default.jpg,False,False,False,Click here to continue the story and make your...
1,3s1rvMFUweQ,17.14.11,Taylor Swift: …Ready for It? (Live) - SNL,Saturday Night Live,24,2017-11-12T06:24:44.000Z,"SNL|""Saturday Night Live""|""SNL Season 43""|""Epi...",1053632,25561,2294,2757,https://i.ytimg.com/vi/3s1rvMFUweQ/default.jpg,False,False,False,Musical guest Taylor Swift performs …Ready for...


In [122]:
anglo_sax_df.describe()

Unnamed: 0,category_id,views,likes,dislikes,comment_count
count,79865.0,79865.0,79865.0,79865.0,79865.0
mean,18.440205,4091166.0,103626.2,5612.328,10708.5
std,7.818304,14391250.0,295726.5,41244.62,44436.79
min,1.0,549.0,0.0,0.0,0.0
25%,10.0,246417.0,5642.0,201.0,642.0
50%,22.0,796106.0,20922.0,712.0,2099.0
75%,24.0,2535704.0,78248.0,2527.0,7220.0
max,43.0,424538900.0,5613827.0,1944971.0,1626501.0


In [93]:
us_cat_df = pd.read_json(us_cat_json_path)

In [95]:
us_cat_df.head(2)

Unnamed: 0,kind,etag,items
0,youtube#videoCategoryListResponse,"""m2yskBQFythfE4irbTIeOgYYfBU/S730Ilt-Fi-emsQJv...","{'kind': 'youtube#videoCategory', 'etag': '""m2..."
1,youtube#videoCategoryListResponse,"""m2yskBQFythfE4irbTIeOgYYfBU/S730Ilt-Fi-emsQJv...","{'kind': 'youtube#videoCategory', 'etag': '""m2..."


In [105]:
# Each value in the "items" column is a nested dict; json_normalize flattens them
# into a flat table with dotted column names such as "snippet.title".
us_cat_df = pd.json_normalize(us_cat_df["items"])

In [132]:
us_cat_df = us_cat_df.rename(columns={"id": "category_id"})

In [138]:
us_video_df.dtypes

video_id                  object
trending_date             object
title                     object
channel_title             object
category_id                int64
publish_time              object
tags                      object
views                      int64
likes                      int64
dislikes                   int64
comment_count              int64
thumbnail_link            object
comments_disabled           bool
ratings_disabled            bool
video_error_or_removed      bool
description               object
dtype: object

In [143]:
# us_video_df stores category_id as int64, so cast this key column to the same dtype.
# Bracket indexing is used for the assignment: attribute-style column access can clash
# with DataFrame attributes/methods and is discouraged for setting values.
us_cat_df["category_id"] = us_cat_df["category_id"].astype("int64")

In [145]:
# Inner join: attach the category metadata to every video via the shared category_id key
joined_df = us_video_df.merge(us_cat_df, how="inner", on="category_id")

In [151]:
joined_df[(joined_df["category_id"] == 22) & (joined_df["snippet.assignable"])].head(2)

Unnamed: 0,video_id,trending_date,title,channel_title,category_id,publish_time,tags,views,likes,dislikes,...,thumbnail_link,comments_disabled,ratings_disabled,video_error_or_removed,description,kind,etag,snippet.channelId,snippet.title,snippet.assignable
0,2kyS6SvSYSE,17.14.11,WE WANT TO TALK ABOUT OUR MARRIAGE,CaseyNeistat,22,2017-11-13T17:13:01.000Z,SHANtell martin,748374,57527,2966,...,https://i.ytimg.com/vi/2kyS6SvSYSE/default.jpg,False,False,False,SHANTELL'S CHANNEL - https://www.youtube.com/s...,youtube#videoCategory,"""m2yskBQFythfE4irbTIeOgYYfBU/xId8RX7vRN8rqkbYZ...",UCBR8-60-B28hp2BmDPdntcQ,People & Blogs,True
1,0mlNzVSJrT0,17.14.11,Me-O Cats Commercial,Nobrand,22,2017-04-21T06:47:32.000Z,"cute|""cats""|""thai""|""eggs""",98966,2486,184,...,https://i.ytimg.com/vi/0mlNzVSJrT0/default.jpg,False,False,False,Kittens come out of the eggs in a Thai commerc...,youtube#videoCategory,"""m2yskBQFythfE4irbTIeOgYYfBU/xId8RX7vRN8rqkbYZ...",UCBR8-60-B28hp2BmDPdntcQ,People & Blogs,True


## Трансформації можуть бути доволі складними. Бажано планувати такі трансформації так, щоб їх можна було описати через apply (це метод, котрий застосовує певну функцію до кожного елемента Series або до кожного рядка чи колонки DataFrame)

In [166]:
from datetime import datetime, timedelta


def _parse_trending_date(raw_value):
    """Convert a ``YY.DD.MM`` trending-date string into a ``datetime``."""
    return datetime.strptime(raw_value, "%y.%d.%m")


# apply calls the helper once per value of the trending_date column
joined_df["parsed_date"] = joined_df["trending_date"].apply(_parse_trending_date)

In [169]:
# Keep the rows whose trending date lies less than two days after 2017-11-14
window_start = datetime(2017, 11, 14)
within_two_days = (joined_df["parsed_date"] - window_start) < timedelta(days=2)
joined_df[within_two_days].head(2)

Unnamed: 0,video_id,trending_date,title,channel_title,category_id,publish_time,tags,views,likes,dislikes,...,comments_disabled,ratings_disabled,video_error_or_removed,description,kind,etag,snippet.channelId,snippet.title,snippet.assignable,parsed_date
0,2kyS6SvSYSE,17.14.11,WE WANT TO TALK ABOUT OUR MARRIAGE,CaseyNeistat,22,2017-11-13T17:13:01.000Z,SHANtell martin,748374,57527,2966,...,False,False,False,SHANTELL'S CHANNEL - https://www.youtube.com/s...,youtube#videoCategory,"""m2yskBQFythfE4irbTIeOgYYfBU/xId8RX7vRN8rqkbYZ...",UCBR8-60-B28hp2BmDPdntcQ,People & Blogs,True,2017-11-14
1,0mlNzVSJrT0,17.14.11,Me-O Cats Commercial,Nobrand,22,2017-04-21T06:47:32.000Z,"cute|""cats""|""thai""|""eggs""",98966,2486,184,...,False,False,False,Kittens come out of the eggs in a Thai commerc...,youtube#videoCategory,"""m2yskBQFythfE4irbTIeOgYYfBU/xId8RX7vRN8rqkbYZ...",UCBR8-60-B28hp2BmDPdntcQ,People & Blogs,True,2017-11-14


In [170]:
# Tags are stored as a single "|"-separated string; split it into a list per row.
# The double quotes around individual tags are kept as part of each list element.
joined_df["parsed_tags"] = joined_df["tags"].apply(lambda col: col.split("|"))

In [172]:
joined_df.explode("parsed_tags")

Unnamed: 0,video_id,trending_date,title,channel_title,category_id,publish_time,tags,views,likes,dislikes,...,ratings_disabled,video_error_or_removed,description,kind,etag,snippet.channelId,snippet.title,snippet.assignable,parsed_date,parsed_tags
0,2kyS6SvSYSE,17.14.11,WE WANT TO TALK ABOUT OUR MARRIAGE,CaseyNeistat,22,2017-11-13T17:13:01.000Z,SHANtell martin,748374,57527,2966,...,False,False,SHANTELL'S CHANNEL - https://www.youtube.com/s...,youtube#videoCategory,"""m2yskBQFythfE4irbTIeOgYYfBU/xId8RX7vRN8rqkbYZ...",UCBR8-60-B28hp2BmDPdntcQ,People & Blogs,True,2017-11-14,SHANtell martin
1,0mlNzVSJrT0,17.14.11,Me-O Cats Commercial,Nobrand,22,2017-04-21T06:47:32.000Z,"cute|""cats""|""thai""|""eggs""",98966,2486,184,...,False,False,Kittens come out of the eggs in a Thai commerc...,youtube#videoCategory,"""m2yskBQFythfE4irbTIeOgYYfBU/xId8RX7vRN8rqkbYZ...",UCBR8-60-B28hp2BmDPdntcQ,People & Blogs,True,2017-11-14,cute
1,0mlNzVSJrT0,17.14.11,Me-O Cats Commercial,Nobrand,22,2017-04-21T06:47:32.000Z,"cute|""cats""|""thai""|""eggs""",98966,2486,184,...,False,False,Kittens come out of the eggs in a Thai commerc...,youtube#videoCategory,"""m2yskBQFythfE4irbTIeOgYYfBU/xId8RX7vRN8rqkbYZ...",UCBR8-60-B28hp2BmDPdntcQ,People & Blogs,True,2017-11-14,"""cats"""
1,0mlNzVSJrT0,17.14.11,Me-O Cats Commercial,Nobrand,22,2017-04-21T06:47:32.000Z,"cute|""cats""|""thai""|""eggs""",98966,2486,184,...,False,False,Kittens come out of the eggs in a Thai commerc...,youtube#videoCategory,"""m2yskBQFythfE4irbTIeOgYYfBU/xId8RX7vRN8rqkbYZ...",UCBR8-60-B28hp2BmDPdntcQ,People & Blogs,True,2017-11-14,"""thai"""
1,0mlNzVSJrT0,17.14.11,Me-O Cats Commercial,Nobrand,22,2017-04-21T06:47:32.000Z,"cute|""cats""|""thai""|""eggs""",98966,2486,184,...,False,False,Kittens come out of the eggs in a Thai commerc...,youtube#videoCategory,"""m2yskBQFythfE4irbTIeOgYYfBU/xId8RX7vRN8rqkbYZ...",UCBR8-60-B28hp2BmDPdntcQ,People & Blogs,True,2017-11-14,"""eggs"""
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
40948,V6ElE2xs48c,18.06.06,Game of Zones - S5:E5: The Isle of Van Gundy,Bleacher Report,43,2018-05-10T21:01:22.000Z,"bleacher report|""br""|""nba""|""Stan Van Gundy""|""J...",1351321,22587,616,...,False,False,"On an island full of Van Gundys, Stan gets tal...",youtube#videoCategory,"""m2yskBQFythfE4irbTIeOgYYfBU/0n9MJVCDLpA8q7aiG...",UCBR8-60-B28hp2BmDPdntcQ,Shows,False,2018-06-06,"""playoffs"""
40948,V6ElE2xs48c,18.06.06,Game of Zones - S5:E5: The Isle of Van Gundy,Bleacher Report,43,2018-05-10T21:01:22.000Z,"bleacher report|""br""|""nba""|""Stan Van Gundy""|""J...",1351321,22587,616,...,False,False,"On an island full of Van Gundys, Stan gets tal...",youtube#videoCategory,"""m2yskBQFythfE4irbTIeOgYYfBU/0n9MJVCDLpA8q7aiG...",UCBR8-60-B28hp2BmDPdntcQ,Shows,False,2018-06-06,"""finals"""
40948,V6ElE2xs48c,18.06.06,Game of Zones - S5:E5: The Isle of Van Gundy,Bleacher Report,43,2018-05-10T21:01:22.000Z,"bleacher report|""br""|""nba""|""Stan Van Gundy""|""J...",1351321,22587,616,...,False,False,"On an island full of Van Gundys, Stan gets tal...",youtube#videoCategory,"""m2yskBQFythfE4irbTIeOgYYfBU/0n9MJVCDLpA8q7aiG...",UCBR8-60-B28hp2BmDPdntcQ,Shows,False,2018-06-06,"""lebron"""
40948,V6ElE2xs48c,18.06.06,Game of Zones - S5:E5: The Isle of Van Gundy,Bleacher Report,43,2018-05-10T21:01:22.000Z,"bleacher report|""br""|""nba""|""Stan Van Gundy""|""J...",1351321,22587,616,...,False,False,"On an island full of Van Gundys, Stan gets tal...",youtube#videoCategory,"""m2yskBQFythfE4irbTIeOgYYfBU/0n9MJVCDLpA8q7aiG...",UCBR8-60-B28hp2BmDPdntcQ,Shows,False,2018-06-06,"""curry"""


### Pandas дозволяє групувати дані та обчислювати для кожної групи ті чи інші функції агрегації. У Pandas цей процес називається [split-apply-combine](https://pandas.pydata.org/docs/user_guide/groupby.html)

In [155]:
# Sum only the numeric and boolean columns within each category.
# numeric_only=True is passed explicitly: older pandas silently dropped the other
# columns with a FutureWarning, and pandas 2.x no longer drops them at all, so the
# text/datetime/list columns of joined_df would otherwise take part in the sum.
joined_df.groupby("category_id").sum(numeric_only=True)

  joined_df.groupby("category_id").sum()


Unnamed: 0_level_0,views,likes,dislikes,comment_count,comments_disabled,ratings_disabled,video_error_or_removed,snippet.assignable
category_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,7284156721,165997476,6075148,17887060,28,14,13,2345
2,520690717,4245656,243010,784447,5,10,0,384
10,40132892190,1416838584,51179008,125296396,9,24,0,6472
15,764651989,19370702,527379,2660705,4,0,0,920
17,4404456673,98621211,5133551,11192155,28,17,2,2174
19,343557084,4836246,340427,911511,0,0,0,402
20,2141218625,69038284,9184466,14740713,8,0,0,817
22,4917191726,186615999,10187901,24778032,66,37,0,3210
23,5117426208,216346746,7230391,22545582,2,0,0,3457
24,20604388195,530516491,42987663,73566498,196,30,8,9964
