## Impotacion de los modulos 

In [None]:
import tempfile
from pyspark.sql import SparkSession
from pyspark.sql.functions import corr, col, count, when
from pyspark.sql import SparkSession

## Configuracion  de la instanci y conexion con el cluster

In [4]:
spark = SparkSession.builder \
    .appName("NetflixAnalysisBoto3") \
    .master("local[*]") \
    .config("spark.executor.memory", "4g") \
    .config("spark.driver.memory", "2g") \
    .config("spark.executor.cores", "2") \
    .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer") \
    .config("spark.sql.files.maxPartitionBytes", "128MB") \
    .config("spark.sql.shuffle.partitions", "200") \
    .config("spark.sql.execution.arrow.enabled", "true") \
    .getOrCreate()

In [5]:
spark.conf.get("spark.executor.cores")

'2'

 ## Lectura del  archivo csv en un dataFrame de Spark e infiere el tipo de dato que guarda por columna

In [12]:
df =spark.read.csv("netflix_titles.csv",header =True, inferSchema =True)

In [13]:
# Mostrar los resultados
df.show()

+-------+-------+--------------------+--------------------+--------------------+--------------------+------------------+------------+------+---------+--------------------+--------------------+
|show_id|   type|               title|            director|                cast|             country|        date_added|release_year|rating| duration|           listed_in|         description|
+-------+-------+--------------------+--------------------+--------------------+--------------------+------------------+------------+------+---------+--------------------+--------------------+
|     s1|  Movie|Dick Johnson Is Dead|     Kirsten Johnson|                NULL|       United States|September 25, 2021|        2020| PG-13|   90 min|       Documentaries|As her father nea...|
|     s2|TV Show|       Blood & Water|                NULL|Ama Qamata, Khosi...|        South Africa|September 24, 2021|        2021| TV-MA|2 Seasons|International TV ...|After crossing pa...|
|     s3|TV Show|           Ganglan

In [14]:
df.printSchema()

root
 |-- show_id: string (nullable = true)
 |-- type: string (nullable = true)
 |-- title: string (nullable = true)
 |-- director: string (nullable = true)
 |-- cast: string (nullable = true)
 |-- country: string (nullable = true)
 |-- date_added: string (nullable = true)
 |-- release_year: string (nullable = true)
 |-- rating: string (nullable = true)
 |-- duration: string (nullable = true)
 |-- listed_in: string (nullable = true)
 |-- description: string (nullable = true)



## Consultas

### 1.Conteo de todos los registros

In [15]:
print(df.count())

8809


### 2. Numero de  elementos por tipo 

In [22]:
df_group_type = df.groupBy("type").count()
df_group_type.show()

+-------------+-----+
|         type|count|
+-------------+-----+
|         NULL|    1|
|      TV Show| 2676|
|        Movie| 6131|
|William Wyler|    1|
+-------------+-----+



### 3.Numero de elementos que son de Mexico

In [53]:
df_movies_mexico = df.filter(col("country")=="Mexico").count()
print(f'Peliculas de mexico : {df_movies_mexico}')

Peliculas de mexico : 110


### 4.Mostrar peliculas que  contienen la palabra Horror en  la columna listed_ in

In [48]:
df_horror = df.select("type","title", "country","listed_in").filter((col("type") == "Movie") & (col("listed_in").contains("Horror")))
df_horror.show()


+-----+--------------------+--------------------+--------------------+
| type|               title|             country|           listed_in|
+-----+--------------------+--------------------+--------------------+
|Movie|          Dark Skies|       United States|Horror Movies, Sc...|
|Movie|              Jaws 2|       United States|Dramas, Horror Mo...|
|Movie|              Jaws 3|       United States|Action & Adventur...|
|Movie|   Jaws: The Revenge|       United States|Action & Adventur...|
|Movie|     Krishna Cottage|               India|Action & Adventur...|
|Movie|          Ragini MMS|               India|Horror Movies, In...|
|Movie|        Ragini MMS 2|               India|Horror Movies, In...|
|Movie|        The Old Ways|       United States|       Horror Movies|
|Movie|             Boomika|                NULL|Horror Movies, In...|
|Movie|     Boomika (Hindi)|                NULL|Horror Movies, In...|
|Movie| Boomika (Malayalam)|                NULL|Horror Movies, In...|
|Movie

### 5.Filtrar  tv shows que tubieron mas de una temporada

In [54]:
df_season = df.select("title", "country","duration").filter(col("duration").contains("Seasons"))
df_season.show()

+--------------------+--------------------+---------+
|               title|             country| duration|
+--------------------+--------------------+---------+
|       Blood & Water|        South Africa|2 Seasons|
|        Kota Factory|               India|2 Seasons|
|The Great British...|      United Kingdom|9 Seasons|
|   Dear White People|       United States|4 Seasons|
|     Falsa identidad|              Mexico|2 Seasons|
|Resurrection: Ert...|              Turkey|5 Seasons|
|Love on the Spectrum|           Australia|2 Seasons|
|       Sex Education|      United Kingdom|3 Seasons|
|        Chhota Bheem|               India|3 Seasons|
|   Castle and Castle|             Nigeria|2 Seasons|
|           Nailed It|       United States|6 Seasons|
|        Numberblocks|      United Kingdom|6 Seasons|
|   Saved by the Bell|       United States|9 Seasons|
|Jack Whitehall: T...|      United Kingdom|5 Seasons|
|The World's Most ...|                NULL|2 Seasons|
|             Lucifer|      

### 6.Mostrar el total de peliculas por pais y ordenarlas del mayor al menor

In [45]:
df_pais = df.groupBy("country").count().orderBy(col("count").desc())
df_pais.show()

+--------------------+-----+
|             country|count|
+--------------------+-----+
|       United States| 2805|
|               India|  972|
|                NULL|  832|
|      United Kingdom|  419|
|               Japan|  245|
|         South Korea|  199|
|              Canada|  181|
|               Spain|  145|
|              France|  123|
|              Mexico|  110|
|               Egypt|  106|
|              Turkey|  105|
|             Nigeria|   93|
|           Australia|   87|
|              Taiwan|   81|
|           Indonesia|   79|
|              Brazil|   77|
|United Kingdom, U...|   75|
|         Philippines|   75|
|United States, Ca...|   73|
+--------------------+-----+
only showing top 20 rows

