# 7. A decade is a sequence of 10 consecutive years. For example, say in your database you have movie information starting from 1965. Then the first decade is 1965, 1966, ..., 1974; the second one is 1967, 1968, ..., 1976 and so on. Find the decade D with the largest number of films and the total number of films in D.

In [1]:
import org.apache.spark.sql.functions._

In [2]:
import org.apache.spark.sql.types._

In [3]:
import org.apache.spark.sql.expressions.Window

In [4]:
val spark=SparkSession.builder().appName("movies app").master("local[*]").getOrCreate()

spark = org.apache.spark.sql.SparkSession@e67b747


org.apache.spark.sql.SparkSession@e67b747

In [5]:
import spark.implicits._

# step1: Data Ingestion to Spark

# required data file is Movie.csv

In [10]:
val Movie=spark.read.option("header","true").csv("/home/use2cobadmin/practice_data/imdb/imdb_csv/Movie.csv")

Movie = [index: string, MID: string ... 4 more fields]


[index: string, MID: string ... 4 more fields]

# All the UDF needed for this task will be here

# 001. This udf is to fetch the year value from given string, it will return zero if there is no 4 digit year value

In [9]:
val getYear=udf((year:String)=>{
    val p="\\d{4}".r
    val yr=p.findFirstMatchIn(year).getOrElse("").toString
    if(yr!=""){
        yr.toInt
    }
    else{
        0
    }
})

getYear = UserDefinedFunction(<function1>,IntegerType,Some(List(StringType)))


UserDefinedFunction(<function1>,IntegerType,Some(List(StringType)))

# printing table structure to know what are the columns and it's data type
# displaying total records in the table

In [12]:
Movie.printSchema
Movie.show(5)
Movie.count

root
 |-- index: string (nullable = true)
 |-- MID: string (nullable = true)
 |-- title: string (nullable = true)
 |-- year: string (nullable = true)
 |-- rating: string (nullable = true)
 |-- num_votes: string (nullable = true)

+-----+---------+-------------+----+------+---------+
|index|      MID|        title|year|rating|num_votes|
+-----+---------+-------------+----+------+---------+
|    0|tt2388771|       Mowgli|2018|   6.6|    21967|
|    1|tt5164214|Ocean's Eight|2018|   6.2|   110861|
|    2|tt1365519|  Tomb Raider|2018|   6.4|   142585|
|    3|tt0848228| The Avengers|2012|   8.1|  1137529|
|    4|tt8239946|      Tumbbad|2018|   8.5|     7483|
+-----+---------+-------------+----+------+---------+
only showing top 5 rows



3475

# we will do data clean since it doesn't have proper data
# after seeing sample data (above)
# observation 1: year,rating,num_votes,index columns data types should be Integer
# observation 2: also we need do trim to remove any extra spaces

In [13]:
val movies=Movie.withColumn("year",getYear(Movie("year")).cast(IntegerType))
.withColumn("title",trim(Movie("title")))
.withColumn("MID",trim(Movie("MID")))
.withColumn("rating",trim(Movie("rating")).cast(DoubleType))
.withColumn("num_votes",trim(Movie("num_votes")).cast(LongType))
.withColumn("index",trim(Movie("index")).cast(IntegerType))
.dropDuplicates("MID")

movies = [index: int, MID: string ... 4 more fields]


[index: int, MID: string ... 4 more fields]

# now print the schema to verify columns and it's data type

In [16]:
movies.printSchema
movies.show(5)

root
 |-- index: integer (nullable = true)
 |-- MID: string (nullable = true)
 |-- title: string (nullable = true)
 |-- year: integer (nullable = false)
 |-- rating: double (nullable = true)
 |-- num_votes: long (nullable = true)

+-----+---------+--------------------+----+------+---------+
|index|      MID|               title|year|rating|num_votes|
+-----+---------+--------------------+----+------+---------+
| 3008|tt0036077|              Kismet|1943|   7.5|       71|
| 2969|tt0102701|Prahaar: The Fina...|1991|   7.9|     1696|
| 1670|tt0268216|              Charas|1976|   6.6|      136|
| 2215|tt0292606|      Nazar Ke Samne|1995|   6.0|      430|
| 1336|tt0298607|Yeh Zindagi Ka Safar|2001|   3.3|      122|
+-----+---------+--------------------+----+------+---------+
only showing top 5 rows



# if count is 0 then ok.. otherwise .. data cleaning need to do

movies.filter($"year"===0).count()

# get the start year form movies to find the decades

In [48]:
val startYear:Int=movies.sort("year").take(1)(0)(3).toString.toInt

startYear = 1931


1931

In [43]:
movies.sort("year").take(1)

Array([1709,tt0021594,Alam Ara,1931,7.4,97])

In [47]:
movies.sort("year").take(1)(0)(3)

1931

# get the end year form movies to find the decades

In [49]:
val endYear:Int=movies.sort(desc("year")).take(1)(0)(3).toString.toInt

endYear = 2018


2018

# now calculate the decades

In [50]:
val decades=startYear+9 to endYear by 9

decades = Range(1940, 1949, 1958, 1967, 1976, 1985, 1994, 2003, 2012)


Range(1940, 1949, 1958, 1967, 1976, 1985, 1994, 2003, 2012)

# importing break control from scala.util.control.Breaks.break

In [51]:
import scala.util.control.Breaks._

# UDF to get Decade for given year

In [53]:
val getDecade=udf((year:Int)=>{
    var d=1
    var yr=year
    breakable {
    for(y<-decades){
        if(yr<=y){
            break
        }
        d+=1
    }
    }
   
    d
   
})

getDecade = UserDefinedFunction(<function1>,IntegerType,Some(List(IntegerType)))


UserDefinedFunction(<function1>,IntegerType,Some(List(IntegerType)))

# call the getDecade function to get Decade for each row

In [58]:
val m_d_ids=movies.withColumn("Decade",getDecade(movies("year")))

m_d_ids = [index: int, MID: string ... 5 more fields]


lastException: Throwable = null


[index: int, MID: string ... 5 more fields]

In [59]:
m_d_ids.groupBy("Decade").count().sort(desc("count")).show()

+------+-----+
|Decade|count|
+------+-----+
|     9| 1011|
|    10|  740|
|     8|  619|
|     7|  413|
|     6|  277|
|     5|  217|
|     4|  115|
|     3|   65|
|     2|   12|
|     1|    6|
+------+-----+

