# 001 List all the directors who directed a 'Comedy' movie in a leap year

In [1]:
spark

org.apache.spark.sql.SparkSession@32e2901f

In [2]:
import org.apache.spark.sql.functions._

In [3]:
import org.apache.spark.sql.types._

In [4]:
import org.apache.spark.sql.expressions.Window

In [5]:
val spark=SparkSession.builder().appName("movies app").master("local[*]").getOrCreate()

spark = org.apache.spark.sql.SparkSession@32e2901f


org.apache.spark.sql.SparkSession@32e2901f

In [6]:
import spark.implicits._

# UDFs

In [7]:
val getYear=udf((year:String)=>{
    val p="\\d{4}".r
    val yr=p.findFirstMatchIn(year).getOrElse("").toString
    if(yr!=""){
        yr.toInt
    }
    else{
        0
    }
})

getYear = UserDefinedFunction(<function1>,IntegerType,Some(List(StringType)))


UserDefinedFunction(<function1>,IntegerType,Some(List(StringType)))

In [8]:
val isLeapYear=udf((year:Long)=>{
    if(year%4==0 && year%100==0){
        true
    }
    else if(year%400==0){
        true
    }
    else{
        false
    }
        
})

isLeapYear = UserDefinedFunction(<function1>,BooleanType,Some(List(LongType)))


UserDefinedFunction(<function1>,BooleanType,Some(List(LongType)))

# loading movies

In [9]:
val Movie=spark.read.option("header","true").csv("/home/use2cobadmin/practice_data/imdb/imdb_csv/Movie.csv")
.dropDuplicates()

Movie = [index: string, MID: string ... 4 more fields]


[index: string, MID: string ... 4 more fields]

# printing table structure to know what are the columns and it's data type

In [10]:
Movie.printSchema

root
 |-- index: string (nullable = true)
 |-- MID: string (nullable = true)
 |-- title: string (nullable = true)
 |-- year: string (nullable = true)
 |-- rating: string (nullable = true)
 |-- num_votes: string (nullable = true)



# Displaying first two records to see sample data
# observation 1: year,rating,num_votes,index columns data types should be Integer
# observation 2: also we need do trim to remove any extra spaces

In [11]:
Movie.show(2)

+-----+---------+-----------------+----+------+---------+
|index|      MID|            title|year|rating|num_votes|
+-----+---------+-----------------+----+------+---------+
|   85|tt3224288|Beyond the Clouds|2017|   7.0|     1123|
|  913|tt2178508|   Son of Sardaar|2012|   4.0|     7895|
+-----+---------+-----------------+----+------+---------+
only showing top 2 rows



# displaying total records in the table

In [12]:
Movie.count

3475

# as per observation need to change the data type

In [13]:
val movies=Movie.withColumn("year",getYear(Movie("year")).cast(IntegerType))
.withColumn("title",trim(Movie("title")))
.withColumn("MID",trim(Movie("MID")))
.withColumn("rating",trim(Movie("rating")).cast(DoubleType))
.withColumn("num_votes",trim(Movie("num_votes")).cast(LongType))
.withColumn("index",trim(Movie("index")).cast(IntegerType))
.dropDuplicates("MID")

movies = [index: int, MID: string ... 4 more fields]


[index: int, MID: string ... 4 more fields]

# now print the schema to verify columns and it's data type

In [14]:
movies.printSchema

root
 |-- index: integer (nullable = true)
 |-- MID: string (nullable = true)
 |-- title: string (nullable = true)
 |-- year: integer (nullable = false)
 |-- rating: double (nullable = true)
 |-- num_votes: long (nullable = true)



# print the total number of records in the table after data cleaning

In [15]:
movies.count

3475

# show the first two records as a sample data

In [16]:
movies.show(2)

+-----+---------+--------------------+----+------+---------+
|index|      MID|               title|year|rating|num_votes|
+-----+---------+--------------------+----+------+---------+
| 3008|tt0036077|              Kismet|1943|   7.5|       71|
| 2969|tt0102701|Prahaar: The Fina...|1991|   7.9|     1696|
+-----+---------+--------------------+----+------+---------+
only showing top 2 rows



# Load the Person.csv file and it has header and store theses records in Person variable

In [17]:
val Person=spark.read.option("header","true").csv("/home/use2cobadmin/practice_data/imdb/imdb_csv/Person.csv")

Person = [index: string, PID: string ... 2 more fields]


[index: string, PID: string ... 2 more fields]

# Print the schema of Person variable,first two records and it's count

In [18]:
Person.show(2)
Person.printSchema
val p1Count=Person.count

+-----+---------+---------------+------+
|index|      PID|           Name|Gender|
+-----+---------+---------------+------+
|    0|nm0000288| Christian Bale|  Male|
|    1|nm0000949| Cate Blanchett|Female|
+-----+---------+---------------+------+
only showing top 2 rows

root
 |-- index: string (nullable = true)
 |-- PID: string (nullable = true)
 |-- Name: string (nullable = true)
 |-- Gender: string (nullable = true)



p1Count = 38285


38285

# Data Cleaning
# change the data types as per data and also remove extra spaces from data

In [19]:
val persons=Person.withColumn("PID",trim(Person("PID")))
.withColumn("Name",trim(Person("Name")))
.withColumn("Gender",trim(Person("Gender")))
.withColumn("index",trim(Person("index")).cast(IntegerType))
.dropDuplicates("PID")

persons = [index: int, PID: string ... 2 more fields]


[index: int, PID: string ... 2 more fields]

# show the sample data, schema and it's count of variable persons

In [20]:
persons.show(2)
persons.printSchema
val p2Count=persons.count

+-----+---------+-------------+------+
|index|      PID|         Name|Gender|
+-----+---------+-------------+------+
|15793|nm0022343|       Alpana|Female|
| 3121|nm0028416|Martin Andris|  Male|
+-----+---------+-------------+------+
only showing top 2 rows

root
 |-- index: integer (nullable = true)
 |-- PID: string (nullable = true)
 |-- Name: string (nullable = true)
 |-- Gender: string (nullable = true)



p2Count = 37566


37566

# print the difference between Person and persons records count

In [21]:
val diff=p1Count-p2Count
println(diff+" duplicates found")

719 duplicates found


diff = 719


719

# Load the M_Director.csv file and it's has header

In [22]:
val M_Director=spark.read.option("header","true").csv("/home/use2cobadmin/practice_data/imdb/imdb_csv/M_Director.csv")

M_Director = [index: string, MID: string ... 2 more fields]


[index: string, MID: string ... 2 more fields]

# display sample data, schema and count M_Director variable

In [23]:
M_Director.printSchema
M_Director.show(2)
val m_d1Count=M_Director.count

root
 |-- index: string (nullable = true)
 |-- MID: string (nullable = true)
 |-- PID: string (nullable = true)
 |-- ID: string (nullable = true)

+-----+---------+---------+---+
|index|      MID|      PID| ID|
+-----+---------+---------+---+
|    0|tt2388771|nm0785227|  0|
|    1|tt5164214|nm0002657|  1|
+-----+---------+---------+---+
only showing top 2 rows



m_d1Count = 3475


3475

# Data cleaning for M_Director

# remove duplicate records, change data type from string to appropriate data type and perform trim operation on each column separately

In [24]:
val m_directors=M_Director.withColumn("ID",trim(col("ID")).cast(LongType))
.withColumn("index",trim(col("index")).cast(LongType))
.withColumn("MID",trim(col("MID")))
.withColumn("PID",trim(col("PID")))
.dropDuplicates("MID","PID")
.dropDuplicates("ID")
.dropDuplicates

m_directors = [index: bigint, MID: string ... 2 more fields]


[index: bigint, MID: string ... 2 more fields]

# display sample data, schema and count m_director variable

In [25]:
m_directors.printSchema
m_directors.show(2)
val m_d2Count=m_directors.count

root
 |-- index: long (nullable = true)
 |-- MID: string (nullable = true)
 |-- PID: string (nullable = true)
 |-- ID: long (nullable = true)

+-----+---------+---------+---+
|index|      MID|      PID| ID|
+-----+---------+---------+---+
|   26|tt5074352|nm4318159| 26|
|   29|tt1098327|nm0939128| 29|
+-----+---------+---------+---+
only showing top 2 rows



m_d2Count = 3475


3475

# Display Count difference between M_Director and m_director variables

In [26]:
val diff_m_directors_count=m_d1Count-m_d2Count
println(diff_m_directors_count+" d")

0 d


diff_m_directors_count = 0


0

# Loading and cleaning Genre data

In [27]:
val Genre=spark.read.option("header","true").csv("/home/use2cobadmin/practice_data/imdb/imdb_csv/Genre.csv")

Genre = [index: string, Name: string ... 1 more field]


[index: string, Name: string ... 1 more field]

In [28]:
Genre.show(5)
Genre.printSchema

+-----+--------------------+---+
|index|                Name|GID|
+-----+--------------------+---+
|    0|Adventure, Drama,...|  0|
|    1|Action, Comedy, C...|  1|
|    2|Action, Adventure...|  2|
|    3|Action, Adventure...|  3|
|    4|Drama, Horror, Th...|  4|
+-----+--------------------+---+
only showing top 5 rows

root
 |-- index: string (nullable = true)
 |-- Name: string (nullable = true)
 |-- GID: string (nullable = true)



In [29]:
val genre=Genre.withColumn("GID",trim(col("GID")).cast(LongType))
.withColumn("index",trim(col("index")).cast(LongType))
.withColumn("name",trim(col("name")))
.dropDuplicates("name")

genre = [index: bigint, name: string ... 1 more field]


[index: bigint, name: string ... 1 more field]

In [30]:
genre.show(5)
genre.printSchema

+-----+--------------------+---+
|index|                name|GID|
+-----+--------------------+---+
|   12|    Action, Thriller| 12|
|   53|     Crime, Thriller| 53|
|   97|Action, Drama, Music| 97|
|  172|Family, Romance, ...|172|
|  284|Crime, Drama, Action|284|
+-----+--------------------+---+
only showing top 5 rows

root
 |-- index: long (nullable = true)
 |-- name: string (nullable = true)
 |-- GID: long (nullable = true)



In [31]:
val M_Genre=spark.read.option("header","true").csv("/home/use2cobadmin/practice_data/imdb/imdb_csv/M_Genre.csv")

M_Genre = [index: string, MID: string ... 2 more fields]


[index: string, MID: string ... 2 more fields]

In [32]:
M_Genre.show(5)
M_Genre.printSchema

+-----+---------+---+---+
|index|      MID|GID| ID|
+-----+---------+---+---+
|    0|tt2388771|  0|  0|
|    1|tt5164214|  1|  1|
|    2|tt1365519|  2|  2|
|    3|tt0848228|  3|  3|
|    4|tt8239946|  4|  4|
+-----+---------+---+---+
only showing top 5 rows

root
 |-- index: string (nullable = true)
 |-- MID: string (nullable = true)
 |-- GID: string (nullable = true)
 |-- ID: string (nullable = true)



In [33]:
val genre_with_mid=M_Genre.withColumn("GID",trim(col("GID")).cast(LongType))
.withColumn("index",trim(col("index")).cast(LongType))
.withColumn("MID",trim(col("MID")))
.dropDuplicates
.dropDuplicates("MID","GID")

genre_with_mid = [index: bigint, MID: string ... 2 more fields]


[index: bigint, MID: string ... 2 more fields]

In [34]:
genre_with_mid.show(5)
genre_with_mid.printSchema

+-----+---------+---+----+
|index|      MID|GID|  ID|
+-----+---------+---+----+
| 3447|tt0055035| 49|3447|
|  804|tt0067183|137| 804|
| 1983|tt0079236|185|1983|
| 2747|tt0138360| 16|2747|
| 3413|tt0156361|309|3413|
+-----+---------+---+----+
only showing top 5 rows

root
 |-- index: long (nullable = true)
 |-- MID: string (nullable = true)
 |-- GID: long (nullable = true)
 |-- ID: string (nullable = true)



In [35]:
val comedy_movie_ids=genre.filter(lower($"name").contains("comedy"))
                    .select("GID")
                    .dropDuplicates()

comedy_movie_ids = [GID: bigint]


[GID: bigint]

In [36]:
comedy_movie_ids.show(5)

+---+
|GID|
+---+
| 65|
|191|
|277|
|113|
| 77|
+---+
only showing top 5 rows



In [37]:
val comedy_mids=genre_with_mid.join(comedy_movie_ids,comedy_movie_ids("GID")===genre_with_mid("GID"),"inner")
.select("MID")

comedy_mids = [MID: string]


[MID: string]

In [38]:
val leap_year_mids=movies.select("MID","year").filter(isLeapYear(col("year"))).select($"MID".alias("LMID"))

leap_year_mids = [LMID: string]


[LMID: string]

In [39]:
val comedy_leap_year_mids=leap_year_mids.join(comedy_mids,comedy_mids("MID")===leap_year_mids("LMID"),"inner")
.select($"MID".alias("CLMID"))

comedy_leap_year_mids = [CLMID: string]


[CLMID: string]

In [40]:
comedy_leap_year_mids.show(5)

+---------+
|    CLMID|
+---------+
|tt0249588|
|tt0250452|
|tt0222270|
|tt0247992|
|tt0271748|
+---------+
only showing top 5 rows



In [41]:
val c_ly_d_pid=m_directors.join(comedy_leap_year_mids,comedy_leap_year_mids("CLMID")===col("MID"),"inner").select($"PID".alias("CLYPID"))

c_ly_d_pid = [CLYPID: string]


[CLYPID: string]

In [42]:
val final_c_ly_d=c_ly_d_pid.join(persons,persons("PID")===c_ly_d_pid("CLYPID")).select("PID","name","gender")

final_c_ly_d = [PID: string, name: string ... 1 more field]


[PID: string, name: string ... 1 more field]

In [43]:
final_c_ly_d.show(5)

+---------+---------------+------+
|      PID|           name|gender|
+---------+---------------+------+
|nm0787453|    Kundan Shah|  null|
|nm0997633|Shakeel Noorani|  null|
|nm0592583|     Aziz Mirza|  null|
|nm0223522|   David Dhawan|  null|
|nm0222427|    Anil Devgan|  null|
+---------+---------------+------+
only showing top 5 rows



# Validation 1

In [44]:
val pid="nm0787453"
val p=persons.filter($"PID"===pid)
p.show

+-----+---------+-----------+------+
|index|      PID|       Name|Gender|
+-----+---------+-----------+------+
|37137|nm0787453|Kundan Shah|  null|
+-----+---------+-----------+------+



pid = nm0787453
p = [index: int, PID: string ... 2 more fields]


[index: int, PID: string ... 2 more fields]

In [45]:
m_directors
.filter($"PID"===pid)
.dir_details.join(movies,"MID")
.join(genre_with_mid,"MID")
.join(genre,"GID")
.select($"name".alias("Genre_Name"),$"title",$"MID",$"PID",$"year",$"GID")
.show()

Name: Compile Error
Message: <console>:50: error: value dir_details is not a member of org.apache.spark.sql.Dataset[org.apache.spark.sql.Row]
possible cause: maybe a semicolon is missing before `value dir_details'?
       .dir_details.join(movies,"MID")
        ^

StackTrace: 

# Validation 2

# Validation 3