
#### About this Dataset: Netflix is one of the most popular media and video streaming platforms.
#### It has over 8,000 movies and TV shows available on its platform and, as of mid-2021, over 200M subscribers globally.
#### This tabular dataset consists of listings of all the movies and TV shows available on Netflix,
#### along with details such as cast, directors, ratings, release year, duration, etc.


In [1]:
#### 1.Show_id - Số thứ tự danh sách phim và tv show
#### 2.Type - Loại nội dung (phim hoặc tv show)
#### 3.Title - Tiêu đề phim và tv show
#### 4.Director - Đạo diễn
#### 5.Cast - Diễn viên
#### 6.Country - Quốc gia
#### 7.Date_added - Ngày thêm
#### 8.Release_year - Năm công chiếu
#### 9.Rating - Phân loại độ tuổi
#### 10.Duration - Khoảng thời gian
#### 11.Listed_in - Thể loại được liệt kê
#### 12.Description - Mô tả


In [2]:
#### import library
from pyspark.sql import SparkSession

In [3]:
# Create the Spark session for this notebook, or reuse one that already exists;
# 'project' is the application name.
spark = SparkSession.builder.appName('project').getOrCreate()

In [4]:
### Loading dataset
from pyspark.sql.types import StructType, StructField,IntegerType,StringType,DateType

In [5]:
# Declare every column as a nullable string so the CSV is loaded verbatim,
# with no type inference or casting at read time.
netflix_columns = [
    'show_id', 'type', 'title', 'director', 'cast', 'country',
    'date_added', 'release_year', 'rating', 'duration',
    'listed_in', 'description',
]
netflix_schema = StructType(
    [StructField(name, StringType(), True) for name in netflix_columns]
)

# The first line of the file is treated as a header; column names and types
# come from the explicit schema above.
# NOTE(review): the null counts reported further down (e.g. a missing `type`)
# hint that a few rows may be split incorrectly by the default quote/escape
# settings of the CSV reader - TODO confirm against the raw file.
netflix_df = spark.read.csv('netflix.csv', header=True, schema=netflix_schema)

In [6]:
### Show the first five rows
# Rows are printed one by one (instead of netflix_df.show(5)) so long fields
# are not truncated; each Row is followed by blank lines for readability.
for row in netflix_df.head(5):
    print(row, end='\n\n\n')

Row(show_id='s1', type='Movie', title='Dick Johnson Is Dead', director='Kirsten Johnson', cast=None, country='United States', date_added='September 25, 2021', release_year='2020', rating='PG-13', duration='90 min', listed_in='Documentaries', description='As her father nears the end of his life, filmmaker Kirsten Johnson stages his death in inventive and comical ways to help them both face the inevitable.')


Row(show_id='s2', type='TV Show', title='Blood & Water', director=None, cast='Ama Qamata, Khosi Ngema, Gail Mabalane, Thabang Molaba, Dillon Windvogel, Natasha Thahane, Arno Greeff, Xolile Tshabalala, Getmore Sithole, Cindy Mahlangu, Ryle De Morny, Greteli Fincham, Sello Maake Ka-Ncube, Odwa Gwanya, Mekaila Mathys, Sandi Schultz, Duane Williams, Shamilla Miller, Patrick Mofokeng', country='South Africa', date_added='September 24, 2021', release_year='2021', rating='TV-MA', duration='2 Seasons', listed_in='International TV Shows, TV Dramas, TV Mysteries', description='After crossing

In [7]:
### Inspect the data type of every column
netflix_df.printSchema()

root
 |-- show_id: string (nullable = true)
 |-- type: string (nullable = true)
 |-- title: string (nullable = true)
 |-- director: string (nullable = true)
 |-- cast: string (nullable = true)
 |-- country: string (nullable = true)
 |-- date_added: string (nullable = true)
 |-- release_year: string (nullable = true)
 |-- rating: string (nullable = true)
 |-- duration: string (nullable = true)
 |-- listed_in: string (nullable = true)
 |-- description: string (nullable = true)



In [8]:
## Count the missing entries (NaN or null) in every column of the raw data

from pyspark.sql.functions import col, count,when,isnan

missing_per_column = []
for column_name in netflix_df.columns:
    # An entry counts as missing when it is NaN or null.
    is_missing = isnan(column_name) | col(column_name).isNull()
    # when() yields a non-null value only for missing entries, so count()
    # tallies exactly those.
    missing_per_column.append(
        count(when(is_missing, column_name)).alias(column_name)
    )
netflix_df.select(missing_per_column).show()

+-------+----+-----+--------+----+-------+----------+------------+------+--------+---------+-----------+
|show_id|type|title|director|cast|country|date_added|release_year|rating|duration|listed_in|description|
+-------+----+-----+--------+----+-------+----------+------------+------+--------+---------+-----------+
|      0|   1|    2|    2636| 826|    832|        13|           2|     6|       5|        3|          3|
+-------+----+-----+--------+----+-------+----------+------------+------+--------+---------+-----------+



In [9]:
## drop column

In [10]:
# Drop the columns that are not used in the rest of the analysis.
# The names are passed as strings: older PySpark releases reject several
# Column objects in a single drop() call, while string names are accepted
# by every release.
df3 = netflix_df.drop('director', 'show_id', 'description')

In [11]:
# Print the schema of df3 to confirm which columns remain after the drop.
df3.printSchema()


root
 |-- type: string (nullable = true)
 |-- title: string (nullable = true)
 |-- cast: string (nullable = true)
 |-- country: string (nullable = true)
 |-- date_added: string (nullable = true)
 |-- release_year: string (nullable = true)
 |-- rating: string (nullable = true)
 |-- duration: string (nullable = true)
 |-- listed_in: string (nullable = true)



In [12]:
# Remove every row that contains a null in any of the remaining columns.
# NOTE: this rebinds df3, so the cell is not idempotent with respect to the
# un-filtered frame; re-run the drop-column cell first to start over.
df3=df3.na.drop()

In [13]:
# Re-count NaN/null entries per column of df3
remaining_missing = []
for column_name in df3.columns:
    # Missing means NaN or null, the same definition as the raw-data check.
    is_missing = isnan(column_name) | col(column_name).isNull()
    remaining_missing.append(
        count(when(is_missing, column_name)).alias(column_name)
    )
df3.select(remaining_missing).show()

+----+-----+----+-------+----------+------------+------+--------+---------+
|type|title|cast|country|date_added|release_year|rating|duration|listed_in|
+----+-----+----+-------+----------+------------+------+--------+---------+
|   0|    0|   0|      0|         0|           0|     0|       0|        0|
+----+-----+----+-------+----------+------------+------+--------+---------+



In [14]:
# Count the non-null entries in each column of df3 (count() ignores nulls).
df3.select([count(c).alias(c) for c in df3.columns]).show()

+----+-----+----+-------+----------+------------+------+--------+---------+
|type|title|cast|country|date_added|release_year|rating|duration|listed_in|
+----+-----+----+-------+----------+------------+------+--------+---------+
|7290| 7290|7290|   7290|      7290|        7290|  7290|    7290|     7290|
+----+-----+----+-------+----------+------------+------+--------+---------+



In [15]:
## Rename columns: capitalise the first letter of every column name
renamed_columns = {
    'type': 'Type',
    'title': 'Title',
    'cast': 'Cast',
    'country': 'Country',
    'date_added': 'Date_added',
    'release_year': 'Release_year',
    'rating': 'Rating',
    'duration': 'Duration',
    'listed_in': 'Listed_in',
}
# Apply the renames one at a time, in the order listed above.
for old_name, new_name in renamed_columns.items():
    df3 = df3.withColumnRenamed(old_name, new_name)

In [16]:
# Print the first five rows of df3 in full, separated by blank lines.
for row in df3.head(5):
    print(row, end='\n\n\n')

Row(Type='TV Show', Title='Blood & Water', Cast='Ama Qamata, Khosi Ngema, Gail Mabalane, Thabang Molaba, Dillon Windvogel, Natasha Thahane, Arno Greeff, Xolile Tshabalala, Getmore Sithole, Cindy Mahlangu, Ryle De Morny, Greteli Fincham, Sello Maake Ka-Ncube, Odwa Gwanya, Mekaila Mathys, Sandi Schultz, Duane Williams, Shamilla Miller, Patrick Mofokeng', Country='South Africa', Date_added='September 24, 2021', Release_year='2021', Rating='TV-MA', Duration='2 Seasons', Listed_in='International TV Shows, TV Dramas, TV Mysteries')


Row(Type='TV Show', Title='Kota Factory', Cast='Mayur More, Jitendra Kumar, Ranjan Raj, Alam Khan, Ahsaas Channa, Revathi Pillai, Urvi Singh, Arun Kumar', Country='India', Date_added='September 24, 2021', Release_year='2021', Rating='TV-MA', Duration='2 Seasons', Listed_in='International TV Shows, Romantic TV Shows, TV Comedies')


Row(Type='Movie', Title='Sankofa', Cast='Kofi Ghanaba, Oyafunmike Ogunlano, Alexandra Duah, Nick Medley, Mutabaruka, Afemo Omilami, 

In [17]:
# Same preview using take(5), which returns the first five rows as a list.
for row in df3.take(5):
    print(row, end='\n\n\n')

Row(Type='TV Show', Title='Blood & Water', Cast='Ama Qamata, Khosi Ngema, Gail Mabalane, Thabang Molaba, Dillon Windvogel, Natasha Thahane, Arno Greeff, Xolile Tshabalala, Getmore Sithole, Cindy Mahlangu, Ryle De Morny, Greteli Fincham, Sello Maake Ka-Ncube, Odwa Gwanya, Mekaila Mathys, Sandi Schultz, Duane Williams, Shamilla Miller, Patrick Mofokeng', Country='South Africa', Date_added='September 24, 2021', Release_year='2021', Rating='TV-MA', Duration='2 Seasons', Listed_in='International TV Shows, TV Dramas, TV Mysteries')


Row(Type='TV Show', Title='Kota Factory', Cast='Mayur More, Jitendra Kumar, Ranjan Raj, Alam Khan, Ahsaas Channa, Revathi Pillai, Urvi Singh, Arun Kumar', Country='India', Date_added='September 24, 2021', Release_year='2021', Rating='TV-MA', Duration='2 Seasons', Listed_in='International TV Shows, Romantic TV Shows, TV Comedies')


Row(Type='Movie', Title='Sankofa', Cast='Kofi Ghanaba, Oyafunmike Ogunlano, Alexandra Duah, Nick Medley, Mutabaruka, Afemo Omilami, 

In [24]:
# Strip leading and trailing spaces from the string columns
from pyspark.sql.functions import trim, initcap,col

# Listed_in is not in this list, so it is left exactly as loaded.
columns_to_trim = (
    "Type", "Title", "Cast", "Country",
    "Date_added", "Release_year", "Rating", "Duration",
)
for column_name in columns_to_trim:
    # Overwrite each column with its trimmed version.
    df3 = df3.withColumn(column_name, trim(col(column_name)))

In [21]:
# Preview df3 as a table (show() prints the first 20 rows and truncates long values).
df3.show()

+-------+--------------------+--------------------+--------------------+------------------+------------+------+---------+--------------------+
|   Type|               Title|                Cast|             Country|        Date_added|Release_year|Rating| Duration|           Listed_in|
+-------+--------------------+--------------------+--------------------+------------------+------------+------+---------+--------------------+
|TV Show|       Blood & Water|Ama Qamata, Khosi...|        South Africa|September 24, 2021|        2021| TV-MA|2 Seasons|International TV ...|
|TV Show|        Kota Factory|Mayur More, Jiten...|               India|September 24, 2021|        2021| TV-MA|2 Seasons|International TV ...|
|  Movie|             Sankofa|Kofi Ghanaba, Oya...|United States, Gh...|September 24, 2021|        1993| TV-MA|  125 min|Dramas, Independe...|
|TV Show|The Great British...|Mel Giedroyc, Sue...|      United Kingdom|September 24, 2021|        2021| TV-14|9 Seasons|British TV Shows,...|

In [25]:
# Title-case the string columns (first letter of each word upper, the rest lower)
# NOTE(review): initcap also lower-cases the remainder of each word, so codes
# such as 'TV-MA' become 'Tv-ma' and 'TV Show' becomes 'Tv Show' (see the
# output of the next cell) - confirm this is intended for Type and Rating.
columns_to_titlecase = (
    "Type", "Title", "Cast", "Country",
    "Date_added", "Release_year", "Rating", "Duration",
)
for column_name in columns_to_titlecase:
    # Overwrite each column with its title-cased version; Listed_in is untouched.
    df3 = df3.withColumn(column_name, initcap(col(column_name)))

In [26]:
# Preview df3 again to inspect the effect of the initcap formatting.
df3.show()

+-------+--------------------+--------------------+--------------------+------------------+------------+------+---------+--------------------+
|   Type|               Title|                Cast|             Country|        Date_added|Release_year|Rating| Duration|           Listed_in|
+-------+--------------------+--------------------+--------------------+------------------+------------+------+---------+--------------------+
|Tv Show|       Blood & Water|Ama Qamata, Khosi...|        South Africa|September 24, 2021|        2021| Tv-ma|2 Seasons|International TV ...|
|Tv Show|        Kota Factory|Mayur More, Jiten...|               India|September 24, 2021|        2021| Tv-ma|2 Seasons|International TV ...|
|  Movie|             Sankofa|Kofi Ghanaba, Oya...|United States, Gh...|September 24, 2021|        1993| Tv-ma|  125 Min|Dramas, Independe...|
|Tv Show|The Great British...|Mel Giedroyc, Sue...|      United Kingdom|September 24, 2021|        2021| Tv-14|9 Seasons|British TV Shows,...|

In [None]:
# NOTE(review): the wildcard import is kept in case a later cell relies on it,
# but it brings in PySpark functions named like Python built-ins (sum, min,
# max, ...); prefer explicit imports once the dependants are known.
from pyspark.sql.functions import *

# Number of titles per content type. The column is referenced by its renamed,
# capitalised name so the cell does not rely on Spark's case-insensitive
# column resolution.
df4 = df3.groupBy("Type").count()
df4.show()


In [None]:
def pieplot(df, col, lim=10, yname=None):

    '''
    Draw a pie chart of the most frequent values of a categorical column.

    Parameters
    ----------
    df : Spark DataFrame
        Frame to summarise.
    col : str
        Name of the categorical column to group by.
    lim : int, default 10
        Maximum number of categories to plot, taken in descending order
        of their row count.
    yname : str or None, default None
        Label for the y axis; passed to ``plt.ylabel`` unchanged, so the
        default ``None`` behaves exactly as before.

    Returns
    -------
    None. The chart is rendered with ``plt.show()``.
    '''

    # Imported here so the function works on a fresh kernel even when it is
    # called before the notebook's plotting-imports cell has been executed.
    import matplotlib.pyplot as plt

    # Group by category, count the rows of each category and sort the
    # categories from most to least frequent.
    classes = df.groupBy(col).count().orderBy('count', ascending=False)

    # Keep only the top `lim` categories and bring them to pandas; only this
    # small aggregate is collected, not the full frame.
    pd_df = classes.limit(lim).toPandas()

    # Slice sizes come from 'count', slice labels from the category values.
    pd_df.plot(kind='pie', x=col, y='count',
               labels=pd_df[col], legend=False)
    plt.ylabel(yname)
    plt.show()

In [None]:
# Share of each content type. The renamed column name 'Type' is used so the
# Spark grouping and the pandas label lookup inside pieplot refer to the same
# column name regardless of case-sensitivity settings.
pieplot(df3, 'Type')

In [None]:
# Keep only the columns needed for the type/country/year view, referenced by
# their renamed (capitalised) names rather than relying on case-insensitive
# column resolution.
df5 = df3.select('Type', 'Country', 'Release_year')
df5.show()

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [None]:
# Convert the raw Spark DataFrame (netflix_df, before any cleaning) to a
# pandas DataFrame; toPandas() collects all rows to the driver.
df = netflix_df.toPandas()

In [None]:
# Report how many distinct values each column of the pandas frame holds and,
# for low-cardinality columns (fewer than 15 distinct values), list them.
# The loop variable is deliberately not called `col`: that name is the PySpark
# column function used by other cells, and rebinding it here would break them.
for column_name in list(df.columns):
    unique_value = df[column_name].unique()
    if len(unique_value) < 15:
        print('\n')
        print(column_name + ': ' + str(len(unique_value)) + 'Unique values ')
        # Print the values themselves (previously referenced an undefined name).
        print(unique_value)
    else:
        print("\n")
        print(column_name + ': ' + str(len(unique_value)) + 'Unique values')

In [None]:
####1- Handling missing values
####2- Creation of new columns for date
####3- Deleting variables
####4- Extraction of words in texts-feature extraction

In [None]:
### Data Cleaning
######1- Handling missing values

In [None]:
import pandas as pd
def func_calc_percentual(df):
    """Summarise missing values per column of a pandas DataFrame.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect.

    Returns
    -------
    pandas.DataFrame
        One row per column of ``df`` with two columns:
        'Valores Ausentes' (number of null entries) and
        '% de Valores Ausentes' (that number as a percentage of the rows).
    """
    total_rows = len(df)
    missing_count = df.isnull().sum()
    missing_share = 100 * missing_count / total_rows

    # Place the two series side by side, labelling them directly via `keys`.
    return pd.concat(
        [missing_count, missing_share],
        axis=1,
        keys=['Valores Ausentes', '% de Valores Ausentes'],
    )
# Print the missing-value summary for the pandas copy of the dataset.
print(func_calc_percentual(df))

In [None]:
# Count, per column, the entries that are neither NaN nor null.
# Columns are referenced as netflix_df[c] instead of col(c) so the cell keeps
# working even if the name `col` has been rebound to something else in the
# kernel (it is an easy name to reuse as a loop variable).
netflix_df.select([count(when(~isnan(c) & netflix_df[c].isNotNull() , c)).alias(c) for c in netflix_df.columns]).show()
