In [1]:
# importing all the necessary libraries
import os
import pandas as pd
import numpy as np
import re

In [2]:
# setting working directory
# The data folder defaults to the original author's location. Set the
# IMDB_DATA_DIR environment variable to point the notebook at another copy of
# the assignment files without editing this cell.
DATA_DIR = os.environ.get("IMDB_DATA_DIR", "D:/MOOC/edwisor/assignment")
os.chdir(DATA_DIR)

## 1. Load CSV by skipping second row.

In [3]:
# Read the IMDB dump into a DataFrame.
#   header=0      -> the first line of the file supplies the column names
#   skiprows=[2]  -> the file line at 0-based position 2 is not read
#   encoding      -> the file is decoded as latin1
imdb = pd.read_csv(
    'IMDB_data.csv',
    header=0,
    skiprows=[2],
    encoding='latin1',
)

imdb.describe()

Unnamed: 0,Plot,Title,imdbVotes,Poster,imdbRating,Genre,imdbID,Year,Language
count,3389,3389,3334,1836,3388,3389,3389,3389,3389
unique,3382,3383,904,1782,87,249,3383,89,4
top,A tormented young woman teams up with an eccen...,Kick,2718,http://i.media-imdb.com/images/SF1f0a42ee1aa08...,87,Drama,tt0461936,2014,English
freq,2,2,1702,52,1702,689,2,1031,2932


## Preparing data 

In [4]:
# Removing leading/trailing whitespace and blanking out missing values.
#
# Every column is converted to text and stripped with the vectorised
# `.str.strip()`. Missing values are then turned into empty strings in both
# forms they can take after the conversion: the literal text 'nan' (what
# `astype(str)` produces from np.nan) and any genuine NaN that is still present.
#
# The result is assigned back to the whole column. Calling
# `imdb[col].replace(..., inplace=True)` mutates a temporary object and is not
# guaranteed to update `imdb`. No blanket `except` is used, so an unexpected
# failure surfaces instead of being silently skipped.
for col in imdb.columns:
    stripped = imdb[col].astype(str).str.strip()
    imdb[col] = stripped.replace('nan', '').fillna('')

imdb.describe()

Unnamed: 0,Plot,Title,imdbVotes,Poster,imdbRating,Genre,imdbID,Year,Language
count,3389,3389,3389,3389.0,3389,3389,3389,3389,3389
unique,3382,3383,905,1783.0,88,249,3383,89,3
top,A tormented young woman teams up with an eccen...,Kick,2718,,87,Drama,tt0461936,2014,English
freq,2,2,1702,1553.0,1702,689,2,1031,2933


In [5]:
## removing junk characters (everything except digits) from the variables
# Vectorised clean-up: every run of non-digit characters is deleted, which
# leaves exactly the digits of the original value joined together.
#
# The cleaned column is assigned back as a whole. Element-wise chained writes
# such as `imdb[col][i] = ...` trigger SettingWithCopyWarning and do not update
# the frame when pandas Copy-on-Write is active.
#
# NOTE: the decimal point is removed as well, so a rating such as '8.7'
# becomes '87'; the rating cell further down divides values above 10 by 10.
for col in ('imdbVotes', 'imdbRating'):
    imdb[col] = imdb[col].str.replace(r'\D+', '', regex=True)

imdb.describe()

Unnamed: 0,Plot,Title,imdbVotes,Poster,imdbRating,Genre,imdbID,Year,Language
count,3389,3389,3389,3389.0,3389,3389,3389,3389,3389
unique,3382,3383,895,1783.0,57,249,3383,89,3
top,A tormented young woman teams up with an eccen...,Kick,2718,,87,Drama,tt0461936,2014,English
freq,2,2,1702,1553.0,1703,689,2,1031,2933


In [6]:
## Empty strings in the two numeric-to-be columns become the text '0'
for blank_col in ('imdbVotes', 'imdbRating'):
    imdb[blank_col] = imdb[blank_col].replace('', '0')

## Discard every row that still contains a missing value
imdb.dropna(axis='index', inplace=True)

## 2. Extract the unique genres and their counts, and store them in a data frame indexed by genre.

In [7]:
# Count the rows for each distinct Genre value; the result is a Series whose
# index holds the Genre strings and whose values are the counts.
rows_by_genre = imdb.groupby('Genre')
uniq_genre = rows_by_genre['Genre'].count()
uniq_genre

Genre
Action                                                                  41
Action, Adventure                                                        3
Action, Adventure, Biography, Drama, History, Musical, Romance           1
Action, Adventure, Comedy, Crime, Musical                                1
Action, Adventure, Comedy, Drama                                         1
Action, Adventure, Comedy, Drama, Fantasy, Musical, Romance, Sci-Fi      1
Action, Adventure, Comedy, Drama, Musical, Thriller                      1
Action, Adventure, Comedy, Horror                                        1
Action, Adventure, Comedy, Romance, Thriller                             1
Action, Adventure, Crime, Drama                                          1
Action, Adventure, Crime, Drama, Family                                  1
Action, Adventure, Crime, Drama, Thriller                                5
Action, Adventure, Drama                                                11
Action, Adventure, 

## 3. Converting required data type 

In [8]:
## Votes and rating are stored as digit strings at this point; cast both to float
for numeric_col in ('imdbVotes', 'imdbRating'):
    imdb[numeric_col] = imdb[numeric_col].astype(float)

In [9]:
## imdbRating should lie in 0 - 10.0
# Any rating above 10 is divided by ten; all other ratings are left as they are.
rating = imdb['imdbRating']
imdb['imdbRating'] = rating.mask(rating > 10.0, rating / 10)

# Sanity check: show the rows whose rating is still above 10 after the rescale
# (empty as long as no rating was above 100 beforehand).
imdb[imdb['imdbRating'] > 10]

Unnamed: 0,Plot,Title,imdbVotes,Poster,imdbRating,Genre,imdbID,Year,Language


In [10]:
# Only the year is available, so it is treated as a category instead of a datetime.
# Keep just the first four characters of each Year value.
imdb['Year'] = imdb['Year'].str[:4]

# Year, Language and Genre become categorical variables
for cat_col in ('Year', 'Language', 'Genre'):
    imdb[cat_col] = imdb[cat_col].astype('category')


# Report the dtype of every column
for column, dtype in imdb.dtypes.items():
    print('dtype = {}, column ={}'.format(dtype, column))

dtype = object, column =Plot
dtype = object, column =Title
dtype = float64, column =imdbVotes
dtype = object, column =Poster
dtype = float64, column =imdbRating
dtype = object, column =Genre
dtype = object, column =imdbID
dtype = category, column =Year
dtype = category, column =Language


## 4. Sorting the genre

In [11]:
# Order the rows by Genre (ascending) and renumber the index from 0,
# discarding the old index labels; both steps modify `imdb` in place.
imdb.sort_values(by=['Genre'], ascending=True, inplace=True)
imdb.reset_index(drop=True, inplace=True)
imdb

Unnamed: 0,Plot,Title,imdbVotes,Poster,imdbRating,Genre,imdbID,Year,Language
0,A secret software created by the American gove...,Lethal Commission,1084.0,http://ia.media-imdb.com/images/M/MV5BODUyNDI1...,6.4,Action,tt2319919,2012,English
1,"In this action flick, JUDE ST. CLERE discovers...",The Lackey,2672.0,http://ia.media-imdb.com/images/M/MV5BMjAwODIx...,7.1,Action,tt2197884,2012,English
2,Eddy (Jayaprakash Reddy) is a powerful mafia l...,Nijam,803.0,,6.5,Action,tt0368896,2003,Telugu
3,Co-Director Tom Logan discusses the making of ...,Feature Commentary with Director Tom Logan,1421.0,http://ia.media-imdb.com/images/M/MV5BMTUzNDQ4...,6.8,Action,tt3099258,2013,English
4,"In a nuclear plant disaster, five teenagers fi...",Lock Down,1887.0,http://ia.media-imdb.com/images/M/MV5BMTgwMDg1...,5.4,Action,tt2349962,2013,English
5,"When long time friend and informer, Louie D. G...",Bullitt and the Mystery of the Devils Root,1887.0,http://ia.media-imdb.com/images/M/MV5BNzg2NzM5...,5.4,Action,tt3020878,2013,English
6,An astrophysicist professor - who firmly belie...,Dimension Folders,2718.0,http://ia.media-imdb.com/images/M/MV5BMTQ5NzA0...,8.7,Action,tt2936218,2013,English
7,"A secret organization called ""The Society"" is ...",The Hunters Circle,2718.0,,8.7,Action,tt2989060,2013,English
8,An out of work good citizen and seasoned Karat...,Monrovia on Fire,2718.0,http://ia.media-imdb.com/images/M/MV5BMjAxNjcz...,8.7,Action,tt3745484,2014,English
9,This is the Quantum Age. An age that now has f...,QUANTUM SHOCK the Movie,2718.0,,8.7,Action,tt4089764,2014,English


## 5. Create a new variable whose values are the square of the difference between imdbRating and imdbVotes.

In [12]:
# New column:
#    imdb_new_var = (imdbRating - imdbVotes) squared, computed row by row

rating_minus_votes = imdb['imdbRating'].sub(imdb['imdbVotes'])
imdb['imdb_new_var'] = rating_minus_votes.pow(2)
imdb

Unnamed: 0,Plot,Title,imdbVotes,Poster,imdbRating,Genre,imdbID,Year,Language,imdb_new_var
0,A secret software created by the American gove...,Lethal Commission,1084.0,http://ia.media-imdb.com/images/M/MV5BODUyNDI1...,6.4,Action,tt2319919,2012,English,1161221.76
1,"In this action flick, JUDE ST. CLERE discovers...",The Lackey,2672.0,http://ia.media-imdb.com/images/M/MV5BMjAwODIx...,7.1,Action,tt2197884,2012,English,7101692.01
2,Eddy (Jayaprakash Reddy) is a powerful mafia l...,Nijam,803.0,,6.5,Action,tt0368896,2003,Telugu,634412.25
3,Co-Director Tom Logan discusses the making of ...,Feature Commentary with Director Tom Logan,1421.0,http://ia.media-imdb.com/images/M/MV5BMTUzNDQ4...,6.8,Action,tt3099258,2013,English,1999961.64
4,"In a nuclear plant disaster, five teenagers fi...",Lock Down,1887.0,http://ia.media-imdb.com/images/M/MV5BMTgwMDg1...,5.4,Action,tt2349962,2013,English,3540418.56
5,"When long time friend and informer, Louie D. G...",Bullitt and the Mystery of the Devils Root,1887.0,http://ia.media-imdb.com/images/M/MV5BNzg2NzM5...,5.4,Action,tt3020878,2013,English,3540418.56
6,An astrophysicist professor - who firmly belie...,Dimension Folders,2718.0,http://ia.media-imdb.com/images/M/MV5BMTQ5NzA0...,8.7,Action,tt2936218,2013,English,7340306.49
7,"A secret organization called ""The Society"" is ...",The Hunters Circle,2718.0,,8.7,Action,tt2989060,2013,English,7340306.49
8,An out of work good citizen and seasoned Karat...,Monrovia on Fire,2718.0,http://ia.media-imdb.com/images/M/MV5BMjAxNjcz...,8.7,Action,tt3745484,2014,English,7340306.49
9,This is the Quantum Age. An age that now has f...,QUANTUM SHOCK the Movie,2718.0,,8.7,Action,tt4089764,2014,English,7340306.49
