In [78]:
#importing libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from IPython.display import display
import webbrowser
from tempfile import NamedTemporaryFile

In [2]:
# Loading data
# The first two lines of the file are skipped and the columns are named explicitly.
column_names = (
    # respondent id and screening questions
    ['Respondent_Id', 'Watched_movie', 'Movie_fan']
    # one "watched" column and one "rank" column per episode (1 to 6)
    + ['Watch_episode_%d' % episode for episode in range(1, 7)]
    + ['rank_episode_%d' % episode for episode in range(1, 7)]
    # one rating column per character
    + ['han_solo', 'luke_skywalker', 'leia_organa', 'anakin_skywalker',
       'obi_wan_kanobi', 'emperor_palpatine', 'darth_vader', 'lando_calrissian',
       'boba_fett', 'c3po', 'r2d2', 'jar_jar_binks', 'padme_amidala', 'yoda']
    # remaining survey questions
    + ['who_shot_first', 'Expanded_universe_familiarity',
       'Expanded_universe_fan', 'Star_trek_fan']
    # demographics
    + ['gender', 'age', 'income', 'education', 'Location_census_region']
)

starwars = pd.read_csv(
    'StarWars.csv',
    sep=',',
    decimal='.',
    header=None,
    skiprows=2,
    names=column_names,
)

In [32]:
# Check datatypes: display the dtype pandas assigned to each column of the loaded frame
starwars.dtypes

Respondent_Id                      int64
Watched_movie                     object
Movie_fan                         object
Watch_episode_1                   object
Watch_episode_2                   object
Watch_episode_3                   object
Watch_episode_4                   object
Watch_episode_5                   object
Watch_episode_6                   object
rank_episode_1                   float64
rank_episode_2                   float64
rank_episode_3                   float64
rank_episode_4                   float64
rank_episode_5                   float64
rank_episode_6                   float64
han_solo                          object
luke_skywalker                    object
leia_organa                       object
anakin_skywalker                  object
obi_wan_kanobi                    object
emperor_palpatine                 object
darth_vader                       object
lando_calrissian                  object
boba_fett                         object
c3po            

In [3]:
# Remove whitespaces
# Every column listed here is stripped of leading/trailing whitespace; the
# distinct values are printed before and after so the effect can be checked.
white_space_cols = [
    # screening questions
    'Watched_movie', 'Movie_fan',
    # episodes watched
    'Watch_episode_1', 'Watch_episode_2', 'Watch_episode_3',
    'Watch_episode_4', 'Watch_episode_5', 'Watch_episode_6',
    # character ratings
    'han_solo', 'luke_skywalker', 'leia_organa', 'anakin_skywalker',
    'obi_wan_kanobi', 'emperor_palpatine', 'darth_vader', 'lando_calrissian',
    'boba_fett', 'c3po', 'r2d2', 'jar_jar_binks', 'padme_amidala', 'yoda',
    # remaining survey questions
    'who_shot_first', 'Expanded_universe_familiarity',
    'Expanded_universe_fan', 'Star_trek_fan',
    # demographics
    'gender', 'age', 'income', 'education', 'Location_census_region',
]
for column in white_space_cols:
    values_before = starwars[column].unique()
    print("Before str.strip : " , values_before)
    stripped = starwars[column].str.strip()
    starwars[column] = stripped
    values_after = starwars[column].unique()
    print("After str.strip  : " , values_after)

('Before str.strip : ', array(['Yes', 'No', 'Yes '], dtype=object))
('After str.strip  : ', array(['Yes', 'No'], dtype=object))
('Before str.strip : ', array(['Yes', nan, 'No', 'Yess', 'Noo'], dtype=object))
('After str.strip  : ', array(['Yes', nan, 'No', 'Yess', 'Noo'], dtype=object))
('Before str.strip : ', array(['Star Wars: Episode I  The Phantom Menace', nan], dtype=object))
('After str.strip  : ', array(['Star Wars: Episode I  The Phantom Menace', nan], dtype=object))
('Before str.strip : ', array(['Star Wars: Episode II  Attack of the Clones', nan], dtype=object))
('After str.strip  : ', array(['Star Wars: Episode II  Attack of the Clones', nan], dtype=object))
('Before str.strip : ', array(['Star Wars: Episode III  Revenge of the Sith', nan], dtype=object))
('After str.strip  : ', array(['Star Wars: Episode III  Revenge of the Sith', nan], dtype=object))
('Before str.strip : ', array(['Star Wars: Episode IV  A New Hope', nan], dtype=object))
('After str.strip  : ', array(['Sta

In [4]:
# Convert to uppercase
# Upper-case every column in white_space_cols in one column-wise pass
starwars[white_space_cols] = starwars[white_space_cols].apply(
    lambda text_column: text_column.str.upper()
)
starwars.head()

Unnamed: 0,Respondent_Id,Watched_movie,Movie_fan,Watch_episode_1,Watch_episode_2,Watch_episode_3,Watch_episode_4,Watch_episode_5,Watch_episode_6,rank_episode_1,...,yoda,who_shot_first,Expanded_universe_familiarity,Expanded_universe_fan,Star_trek_fan,gender,age,income,education,Location_census_region
0,3292879998,YES,YES,STAR WARS: EPISODE I THE PHANTOM MENACE,STAR WARS: EPISODE II ATTACK OF THE CLONES,STAR WARS: EPISODE III REVENGE OF THE SITH,STAR WARS: EPISODE IV A NEW HOPE,STAR WARS: EPISODE V THE EMPIRE STRIKES BACK,STAR WARS: EPISODE VI RETURN OF THE JEDI,3.0,...,VERY FAVORABLY,I DON'T UNDERSTAND THIS QUESTION,YES,NO,NO,MALE,18-29,,HIGH SCHOOL DEGREE,SOUTH ATLANTIC
1,3292879538,NO,,,,,,,,,...,,,,,YES,MALE,18-29,"$0 - $24,999",BACHELOR DEGREE,WEST SOUTH CENTRAL
2,3292765271,YES,NO,STAR WARS: EPISODE I THE PHANTOM MENACE,STAR WARS: EPISODE II ATTACK OF THE CLONES,STAR WARS: EPISODE III REVENGE OF THE SITH,,,,1.0,...,UNFAMILIAR (N/A),I DON'T UNDERSTAND THIS QUESTION,NO,,NO,MALE,18-29,"$0 - $24,999",HIGH SCHOOL DEGREE,WEST NORTH CENTRAL
3,3292763116,YES,YES,STAR WARS: EPISODE I THE PHANTOM MENACE,STAR WARS: EPISODE II ATTACK OF THE CLONES,STAR WARS: EPISODE III REVENGE OF THE SITH,STAR WARS: EPISODE IV A NEW HOPE,STAR WARS: EPISODE V THE EMPIRE STRIKES BACK,STAR WARS: EPISODE VI RETURN OF THE JEDI,5.0,...,VERY FAVORABLY,I DON'T UNDERSTAND THIS QUESTION,NO,,YES,MALE,18-29,"$100,000 - $149,999",SOME COLLEGE OR ASSOCIATE DEGREE,WEST NORTH CENTRAL
4,3292731220,YES,YES,STAR WARS: EPISODE I THE PHANTOM MENACE,STAR WARS: EPISODE II ATTACK OF THE CLONES,STAR WARS: EPISODE III REVENGE OF THE SITH,STAR WARS: EPISODE IV A NEW HOPE,STAR WARS: EPISODE V THE EMPIRE STRIKES BACK,STAR WARS: EPISODE VI RETURN OF THE JEDI,5.0,...,SOMEWHAT FAVORABLY,GREEDO,YES,NO,NO,MALE,18-29,"$100,000 - $149,999",SOME COLLEGE OR ASSOCIATE DEGREE,WEST NORTH CENTRAL


In [5]:
# Correct typos
# Print the distinct values of every column so misspelled categories can be spotted
for column in starwars.columns:
    print(starwars[column].unique())

# There are typos present in 'Movie_fan', 'Expanded_universe_fan', 'Star_trek_fan' and 'gender'.
# Each corrected Series is assigned back to its column: calling
# replace(..., inplace=True) on starwars[col] modifies a temporary Series and
# does not update the DataFrame when pandas Copy-on-Write is active.

# Correcting Movie_fan
starwars['Movie_fan'] = starwars['Movie_fan'].replace(['YESS', 'NOO'], ['YES', 'NO'])

# Correcting Expanded_universe_fan
starwars['Expanded_universe_fan'] = starwars['Expanded_universe_fan'].replace('YESS', 'YES')

# Correcting Star_trek_fan
starwars['Star_trek_fan'] = starwars['Star_trek_fan'].replace('NOO', 'NO')

# Correcting gender
starwars['gender'] = starwars['gender'].replace('F', 'FEMALE')

print('\n\nCorrected values  : \n')
for column in starwars.columns:
    print(starwars[column].unique())

[3292879998 3292879538 3292765271 ..., 3288375286 3288373068 3288372923]
['YES' 'NO']
['YES' nan 'NO' 'YESS' 'NOO']
['STAR WARS: EPISODE I  THE PHANTOM MENACE' nan]
['STAR WARS: EPISODE II  ATTACK OF THE CLONES' nan]
['STAR WARS: EPISODE III  REVENGE OF THE SITH' nan]
['STAR WARS: EPISODE IV  A NEW HOPE' nan]
['STAR WARS: EPISODE V THE EMPIRE STRIKES BACK' nan]
['STAR WARS: EPISODE VI RETURN OF THE JEDI' nan]
[  3.  nan   1.   5.   6.   4.   2.]
[  2.  nan   6.   4.   5.   1.   3.]
[  1.  nan   3.   6.   4.   5.   2.]
[  4.  nan   2.   6.   3.   1.   5.]
[  5.  nan   4.   1.   2.   3.   6.]
[  6.  nan   3.   2.   1.   4.   5.]
['VERY FAVORABLY' nan 'SOMEWHAT FAVORABLY'
 'NEITHER FAVORABLY NOR UNFAVORABLY (NEUTRAL)' 'SOMEWHAT UNFAVORABLY'
 'UNFAMILIAR (N/A)' 'VERY UNFAVORABLY']
['VERY FAVORABLY' nan 'SOMEWHAT FAVORABLY' 'SOMEWHAT UNFAVORABLY'
 'NEITHER FAVORABLY NOR UNFAVORABLY (NEUTRAL)' 'VERY UNFAVORABLY'
 'UNFAMILIAR (N/A)']
['VERY FAVORABLY' nan 'SOMEWHAT FAVORABLY' 'SOMEWHAT UNFAVO

In [6]:
# Sanity check
# From above inspection, only the age column has an impossible value: '500'
# NOTE(review): '500' is mapped to the '> 60' bracket; that is a guess at what
# the respondent meant - confirm, or treat the value as missing instead.

# Assign the result back instead of replace(..., inplace=True) on starwars['age'],
# which does not update the DataFrame when pandas Copy-on-Write is active.
starwars['age'] = starwars['age'].replace('500', '> 60')
starwars['age'].unique()


array(['18-29', nan, '> 60', '30-44', '45-60'], dtype=object)

In [16]:
# Handling missing values
#
# Every fillna result is assigned back to the frame: calling
# fillna(..., inplace=True) on starwars[col] modifies a temporary Series and
# does not update the DataFrame when pandas Copy-on-Write is active.

# First we check for the total missing values per column.
# It is printed explicitly because only the last expression of a cell is displayed.
print(starwars.isnull().sum())

# We will replace NAs in Movie_fan and Watch_episode 1 to 6 by 'NO' since no information is available
list_range_1 = ['Movie_fan', 'Watch_episode_1', 'Watch_episode_2', 'Watch_episode_3',
                'Watch_episode_4', 'Watch_episode_5', 'Watch_episode_6']
starwars[list_range_1] = starwars[list_range_1].fillna('NO')

# We will replace NAs in rank_episode 1 to 6 by the mode of each column.
# NOTE(review): after this step, respondents who gave no rank are counted as
# having given the modal rank by every later count of ranks - confirm that
# this is intended for the analysis cells below.
list_range_2 = ['rank_episode_1', 'rank_episode_2', 'rank_episode_3',
                'rank_episode_4', 'rank_episode_5', 'rank_episode_6']
for i in list_range_2:
    starwars[i] = starwars[i].fillna(starwars[i].mode()[0])

# We will replace NAs for character and who_shot_first columns by 'NOT ANSWERED'
list_range_3 = ['han_solo', 'luke_skywalker', 'leia_organa', 'anakin_skywalker',
                'obi_wan_kanobi', 'emperor_palpatine', 'darth_vader',
                'lando_calrissian', 'boba_fett', 'c3po', 'r2d2', 'jar_jar_binks',
                'padme_amidala', 'yoda', 'who_shot_first']
starwars[list_range_3] = starwars[list_range_3].fillna('NOT ANSWERED')

# We will replace NAs for 'Expanded_universe_familiarity','Expanded_universe_fan','Star_trek_fan' by NOT ANSWERED
list_range_4 = ['Expanded_universe_familiarity', 'Expanded_universe_fan', 'Star_trek_fan']
starwars[list_range_4] = starwars[list_range_4].fillna('NOT ANSWERED')

# Missing values remaining after the replacements above
print(starwars.isnull().sum())
starwars.head()

Unnamed: 0,Respondent_Id,Watched_movie,Movie_fan,Watch_episode_1,Watch_episode_2,Watch_episode_3,Watch_episode_4,Watch_episode_5,Watch_episode_6,rank_episode_1,...,yoda,who_shot_first,Expanded_universe_familiarity,Expanded_universe_fan,Star_trek_fan,gender,age,income,education,Location_census_region
0,3292879998,YES,YES,STAR WARS: EPISODE I THE PHANTOM MENACE,STAR WARS: EPISODE II ATTACK OF THE CLONES,STAR WARS: EPISODE III REVENGE OF THE SITH,STAR WARS: EPISODE IV A NEW HOPE,STAR WARS: EPISODE V THE EMPIRE STRIKES BACK,STAR WARS: EPISODE VI RETURN OF THE JEDI,3.0,...,VERY FAVORABLY,I DON'T UNDERSTAND THIS QUESTION,YES,NO,NO,MALE,18-29,,HIGH SCHOOL DEGREE,SOUTH ATLANTIC
1,3292879538,NO,NO,NO,NO,NO,NO,NO,NO,4.0,...,NOT ANSWERED,NOT ANSWERED,NOT ANSWERED,NOT ANSWERED,YES,MALE,18-29,"$0 - $24,999",BACHELOR DEGREE,WEST SOUTH CENTRAL
2,3292765271,YES,NO,STAR WARS: EPISODE I THE PHANTOM MENACE,STAR WARS: EPISODE II ATTACK OF THE CLONES,STAR WARS: EPISODE III REVENGE OF THE SITH,NO,NO,NO,1.0,...,UNFAMILIAR (N/A),I DON'T UNDERSTAND THIS QUESTION,NO,NOT ANSWERED,NO,MALE,18-29,"$0 - $24,999",HIGH SCHOOL DEGREE,WEST NORTH CENTRAL
3,3292763116,YES,YES,STAR WARS: EPISODE I THE PHANTOM MENACE,STAR WARS: EPISODE II ATTACK OF THE CLONES,STAR WARS: EPISODE III REVENGE OF THE SITH,STAR WARS: EPISODE IV A NEW HOPE,STAR WARS: EPISODE V THE EMPIRE STRIKES BACK,STAR WARS: EPISODE VI RETURN OF THE JEDI,5.0,...,VERY FAVORABLY,I DON'T UNDERSTAND THIS QUESTION,NO,NOT ANSWERED,YES,MALE,18-29,"$100,000 - $149,999",SOME COLLEGE OR ASSOCIATE DEGREE,WEST NORTH CENTRAL
4,3292731220,YES,YES,STAR WARS: EPISODE I THE PHANTOM MENACE,STAR WARS: EPISODE II ATTACK OF THE CLONES,STAR WARS: EPISODE III REVENGE OF THE SITH,STAR WARS: EPISODE IV A NEW HOPE,STAR WARS: EPISODE V THE EMPIRE STRIKES BACK,STAR WARS: EPISODE VI RETURN OF THE JEDI,5.0,...,SOMEWHAT FAVORABLY,GREEDO,YES,NO,NO,MALE,18-29,"$100,000 - $149,999",SOME COLLEGE OR ASSOCIATE DEGREE,WEST NORTH CENTRAL


In [15]:
# Task 2.1 Data exploration

# Explore the survey question that asks respondents to rank the Star Wars movies
# from most favourite to least favourite, then analyse how people rate those movies.

# For each episode, print how many respondents gave it each rank
for episode in range(1, 7):
    print("\nRanking for Episode %d : " % episode)
    print(starwars['rank_episode_%d' % episode].value_counts())




Ranking for Episode 1 : 
4.0    237
6.0    168
3.0    130
1.0    129
5.0    100
2.0     71
Name: rank_episode_1, dtype: int64

Ranking for Episode 2 : 
5.0    300
4.0    183
2.0    116
3.0    103
6.0    102
1.0     32
Name: rank_episode_2, dtype: int64

Ranking for Episode 3 : 
6.0    217
5.0    203
4.0    182
3.0    150
2.0     47
1.0     36
Name: rank_episode_3, dtype: int64

Ranking for Episode 4 : 
1.0    204
6.0    161
2.0    135
4.0    130
3.0    127
5.0     79
Name: rank_episode_4, dtype: int64

Ranking for Episode 5 : 
1.0    289
2.0    235
5.0    118
3.0    106
4.0     47
6.0     41
Name: rank_episode_5, dtype: int64

Ranking for Episode 6 : 
2.0    232
3.0    220
1.0    146
6.0    145
4.0     57
5.0     36
Name: rank_episode_6, dtype: int64
   Respondent_Id Watched_movie Movie_fan  \
0     3292879998           YES       YES   
1     3292879538            NO       NaN   
2     3292765271           YES        NO   
3     3292763116           YES       YES   
4     3292731220  

In [17]:
# Analysis:
# After analysis we can see that most people have selected Episode 5 as their favourite movie,
# followed by Episode 4, whereas most people have voted Episode 3 as their least favourite movie,
# followed by Episode 2.

Unnamed: 0,Respondent_Id,Watched_movie,Movie_fan,Watch_episode_1,Watch_episode_2,Watch_episode_3,Watch_episode_4,Watch_episode_5,Watch_episode_6,rank_episode_1,...,yoda,who_shot_first,Expanded_universe_familiarity,Expanded_universe_fan,Star_trek_fan,gender,age,income,education,Location_census_region
0,3292879998,YES,YES,STAR WARS: EPISODE I THE PHANTOM MENACE,STAR WARS: EPISODE II ATTACK OF THE CLONES,STAR WARS: EPISODE III REVENGE OF THE SITH,STAR WARS: EPISODE IV A NEW HOPE,STAR WARS: EPISODE V THE EMPIRE STRIKES BACK,STAR WARS: EPISODE VI RETURN OF THE JEDI,3.0,...,VERY FAVORABLY,I DON'T UNDERSTAND THIS QUESTION,YES,NO,NO,MALE,18-29,,HIGH SCHOOL DEGREE,SOUTH ATLANTIC
1,3292879538,NO,NO,NO,NO,NO,NO,NO,NO,4.0,...,NOT ANSWERED,NOT ANSWERED,NOT ANSWERED,NOT ANSWERED,YES,MALE,18-29,"$0 - $24,999",BACHELOR DEGREE,WEST SOUTH CENTRAL
2,3292765271,YES,NO,STAR WARS: EPISODE I THE PHANTOM MENACE,STAR WARS: EPISODE II ATTACK OF THE CLONES,STAR WARS: EPISODE III REVENGE OF THE SITH,NO,NO,NO,1.0,...,UNFAMILIAR (N/A),I DON'T UNDERSTAND THIS QUESTION,NO,NOT ANSWERED,NO,MALE,18-29,"$0 - $24,999",HIGH SCHOOL DEGREE,WEST NORTH CENTRAL
3,3292763116,YES,YES,STAR WARS: EPISODE I THE PHANTOM MENACE,STAR WARS: EPISODE II ATTACK OF THE CLONES,STAR WARS: EPISODE III REVENGE OF THE SITH,STAR WARS: EPISODE IV A NEW HOPE,STAR WARS: EPISODE V THE EMPIRE STRIKES BACK,STAR WARS: EPISODE VI RETURN OF THE JEDI,5.0,...,VERY FAVORABLY,I DON'T UNDERSTAND THIS QUESTION,NO,NOT ANSWERED,YES,MALE,18-29,"$100,000 - $149,999",SOME COLLEGE OR ASSOCIATE DEGREE,WEST NORTH CENTRAL
4,3292731220,YES,YES,STAR WARS: EPISODE I THE PHANTOM MENACE,STAR WARS: EPISODE II ATTACK OF THE CLONES,STAR WARS: EPISODE III REVENGE OF THE SITH,STAR WARS: EPISODE IV A NEW HOPE,STAR WARS: EPISODE V THE EMPIRE STRIKES BACK,STAR WARS: EPISODE VI RETURN OF THE JEDI,5.0,...,SOMEWHAT FAVORABLY,GREEDO,YES,NO,NO,MALE,18-29,"$100,000 - $149,999",SOME COLLEGE OR ASSOCIATE DEGREE,WEST NORTH CENTRAL


In [8]:
# Value stored in each Watch_episode_N column when the respondent has seen
# that episode (episodes 1 to 6, in order).
episode_titles = [
    "STAR WARS: EPISODE I  THE PHANTOM MENACE",
    "STAR WARS: EPISODE II  ATTACK OF THE CLONES",
    "STAR WARS: EPISODE III  REVENGE OF THE SITH",
    "STAR WARS: EPISODE IV  A NEW HOPE",
    "STAR WARS: EPISODE V THE EMPIRE STRIKES BACK",
    "STAR WARS: EPISODE VI RETURN OF THE JEDI",
]


def count_ranks_among_viewers(episode, title):
    """Count the ranks given to one episode by respondents who watched it.

    episode -- episode number (1 to 6), used to build the column names
    title   -- value of Watch_episode_<episode> that marks the episode as watched

    Returns a list of six counts: position 0 is the number of viewers who gave
    rank 1, position 5 the number who gave rank 6.
    """
    watched = starwars['Watch_episode_%d' % episode] == title
    ranks = starwars.loc[watched, 'rank_episode_%d' % episode]
    return [(ranks == rank).sum() for rank in range(1, 7)]


list_movie1 = count_ranks_among_viewers(1, episode_titles[0])
list_movie2 = count_ranks_among_viewers(2, episode_titles[1])
list_movie3 = count_ranks_among_viewers(3, episode_titles[2])
list_movie4 = count_ranks_among_viewers(4, episode_titles[3])
list_movie5 = count_ranks_among_viewers(5, episode_titles[4])
list_movie6 = count_ranks_among_viewers(6, episode_titles[5])

# One row per rank, one column per movie. The index carries the real rank
# values 1-6 (the counts above are for ranks 1-6, not 0-5).
rank_list = [1, 2, 3, 4, 5, 6]
rank_df = pd.DataFrame(
    list(zip(rank_list, list_movie1, list_movie2, list_movie3,
             list_movie4, list_movie5, list_movie6)),
    columns=["Ranks", "Movie 1", "Movie 2", "Movie 3", "Movie 4", "Movie 5", "Movie 6"],
).set_index('Ranks', drop=True)

# Pie chart of the first row: how the rank 1 votes are split across the six movies
fig, ax = plt.subplots(1, 1)
fig.set_size_inches(5, 5)
rank_df.iloc[0].plot(kind='pie', ax=ax, autopct='%1.1f%%')
ax.set_title('Share of rank 1 votes per movie')
ax.set_ylabel('')
ax.set_xlabel('')
plt.show()


In [9]:
#Task 2.2:

#Relationship between 3 pairs of columns and their visualisation

#First Pair:

#Plausible hypothesis: 2.2.a
#"Being a fan of the movie franchise leads to a higher ranking of the original trilogy movie - Epi V: The Empire Strikes Back."

# Movie fan and episode 5 ranking:
# number of respondents who gave episode 5 each rank (1 to 6), split by fan status
list_episode_5_yes = [
    ((starwars['Movie_fan'] == 'YES') & (starwars['rank_episode_5'] == rank)).sum()
    for rank in range(1, 7)
]
list_episode_5_no = [
    ((starwars['Movie_fan'] == 'NO') & (starwars['rank_episode_5'] == rank)).sum()
    for rank in range(1, 7)
]

# Grouped bar chart: one pair of bars (fan / not a fan) per rank
width = 0.35
ind = np.arange(6)
fig, ax = plt.subplots()
ax.bar(ind, list_episode_5_yes, width, bottom=0, color='purple', label='Fan')
ax.bar(ind + width, list_episode_5_no, width, bottom=0, color='grey', label='Not a Fan')
# Fix the tick positions first, then label them: labels set before the
# positions are attached to whatever ticks the default locator picks.
ax.set_xticks(ind + width / 2)
ax.set_xticklabels(('Rank1', 'Rank2', 'Rank3', 'Rank4', 'Rank5', 'Rank6'))
ax.set_ylabel('Number of respondents')
ax.legend()
ax.set_title('Ranking Episode 5 by Fans')
ax.autoscale_view()
plt.show()


[196, 172, 80, 32, 52, 20]
[93, 63, 26, 15, 66, 21]


In [42]:
#Addressing plausible hypothesis 2.2.a:

#As observed in the graph above, the result contradicts our plausible hypothesis 2.2.a:
#people who are not fans of the movie franchise have still given rank 1 (the highest ranking) to original trilogy movie 5.

In [10]:

#Second Pair:

#Plausible hypothesis: 2.2.b
#"Being the least favourite movie of the new trilogy, episode 3 will be ranked least favourite by movie franchise fans"

# Movie fan and episode 3 ranking:
# number of respondents who gave episode 3 each rank (1 to 6), split by fan status
list_episode_3_yes = [
    ((starwars['Movie_fan'] == 'YES') & (starwars['rank_episode_3'] == rank)).sum()
    for rank in range(1, 7)
]
print(list_episode_3_yes)

list_episode_3_no = [
    ((starwars['Movie_fan'] == 'NO') & (starwars['rank_episode_3'] == rank)).sum()
    for rank in range(1, 7)
]
print(list_episode_3_no)

# Grouped bar chart: one pair of bars (fan / not a fan) per rank
width = 0.35
ind = np.arange(6)
fig, ax = plt.subplots()
ax.bar(ind, list_episode_3_yes, width, bottom=0, color='navy', label='Fan')
ax.bar(ind + width, list_episode_3_no, width, bottom=0, color='teal', label='Not a Fan')
# Fix the tick positions first, then label them: labels set before the
# positions are attached to whatever ticks the default locator picks.
ax.set_xticks(ind + width / 2)
ax.set_xticklabels(('Rank1', 'Rank2', 'Rank3', 'Rank4', 'Rank5', 'Rank6'))
ax.set_ylabel('Number of respondents')
ax.legend()
ax.set_title('Ranking Episode 3 by Fans')
ax.autoscale_view()
plt.show()



[25, 29, 80, 132, 127, 158]
[11, 18, 70, 50, 76, 59]


In [44]:
#Addressing plausible hypothesis 2.2.b:

#As observed in the bar chart above, rank 6 outnumbers all the other rankings for episode 3.
#Rank 6 was given to this movie by the largest number of both fans and non-fans of the movie franchise.
#However, the graph illustrates that rank 6 was given by more non-fans of the franchise than
#fans of the franchise.


In [19]:
#Third Pair:

#Plausible hypothesis: 2.2.c:
#"The people who have actually watched the movie will give the highest number of rankings, whatever the rank -
#rank 1 or rank 6."

# Bar chart of how episode 5 was ranked (ranks 1 to 6) by people who have
# seen it and by people whose Watch_episode_5 value is 'NO'.
episode_5_title = 'STAR WARS: EPISODE V THE EMPIRE STRIKES BACK'

list_episode_5_rank_yes = [
    ((starwars['Watch_episode_5'] == episode_5_title) & (starwars['rank_episode_5'] == rank)).sum()
    for rank in range(1, 7)
]
list_episode_5_rank_no = [
    ((starwars['Watch_episode_5'] == 'NO') & (starwars['rank_episode_5'] == rank)).sum()
    for rank in range(1, 7)
]

# Grouped bar chart: one pair of bars (watched / did not watch) per rank
width = 0.35
ind = np.arange(6)
fig, ax = plt.subplots()
ax.bar(ind, list_episode_5_rank_yes, width, bottom=0, color='yellow', label='Watched')
ax.bar(ind + width, list_episode_5_rank_no, width, bottom=0, color='navy', label='Did Not Watch')
# Fix the tick positions first, then label them: labels set before the
# positions are attached to whatever ticks the default locator picks.
ax.set_xticks(ind + width / 2)
ax.set_xticklabels(('Rank1', 'Rank2', 'Rank3', 'Rank4', 'Rank5', 'Rank6'))
ax.set_ylabel('Number of respondents')
ax.legend()
ax.set_title('Ranking Episode 5 by Who watched it')
ax.autoscale_view()
plt.show()



In [17]:
#Addressing plausible hypothesis 2.2.c:

#As observed in the bar chart above, episode 5 is given the highest ranking of 1 by both categories (people who have watched the movie and people who did not watch it).
#However, episode 5 is ranked 1 by 353 people who did not watch the movie, which outnumbers
#the 286 people who ranked the movie 1 after watching it.
#For the other ranks, it is evident that more rankings are given by the people who have actually watched the movie.

In [79]:
# Task 2.3

# Explore a specific relationship: here we explore and analyse the relationship
# between the characters and the demographics of the people.

# Every answer that can appear in a character rating column
possible_response = ['VERY FAVORABLY', 'SOMEWHAT FAVORABLY', 'NEITHER FAVORABLY NOR UNFAVORABLY (NEUTRAL)',
                     'SOMEWHAT UNFAVORABLY', 'VERY UNFAVORABLY',
                     'UNFAMILIAR (N/A)', 'NOT ANSWERED']


def count_responses(column):
    """Count how often each answer in possible_response occurs in one column.

    column -- name of a character rating column of starwars

    Returns a list with one count per entry of possible_response, in the same order.
    """
    answers = starwars[column]
    return [(answers == response).sum() for response in possible_response]


list_han_solo = count_responses('han_solo')
list_luke_skywalker = count_responses('luke_skywalker')
list_leia_organa = count_responses('leia_organa')
list_anakin_skywalker = count_responses('anakin_skywalker')
list_obi_wan_kanobi = count_responses('obi_wan_kanobi')
list_emperor_palpatine = count_responses('emperor_palpatine')
list_darth_vader = count_responses('darth_vader')
list_lando_calrissian = count_responses('lando_calrissian')
list_boba_fett = count_responses('boba_fett')
list_c3po = count_responses('c3po')
list_r2d2 = count_responses('r2d2')
list_jar_jar_binks = count_responses('jar_jar_binks')
list_padme_amidala = count_responses('padme_amidala')
list_yoda = count_responses('yoda')

# One row per possible answer (used as the index), one column per character
response_df = pd.DataFrame(
    list(zip(possible_response, list_han_solo, list_luke_skywalker, list_leia_organa,
             list_anakin_skywalker, list_obi_wan_kanobi, list_emperor_palpatine,
             list_darth_vader, list_lando_calrissian, list_boba_fett,
             list_c3po, list_r2d2, list_jar_jar_binks, list_padme_amidala, list_yoda)),
    columns=['Response', 'han_solo', 'luke_skywalker', 'leia_organa', 'anakin_skywalker',
             'obi_wan_kanobi', 'emperor_palpatine', 'darth_vader', 'lando_calrissian',
             'boba_fett', 'c3po', 'r2d2', 'jar_jar_binks', 'padme_amidala', 'yoda'],
).set_index('Response', drop=True)

def df_window(df):
    """Write ``df`` as an HTML table to a temporary file and open it in the web browser.

    df -- the DataFrame to show.

    The temporary ``.html`` file is created with ``delete=False`` so that it
    still exists when the browser loads it; this function does not remove it.
    """
    # Text mode is required: DataFrame.to_html writes str, whereas
    # NamedTemporaryFile opens the file in binary mode ('w+b') by default.
    with NamedTemporaryFile(mode='w', delete=False, suffix='.html') as f:
        df.to_html(f)
    # Opened only after the with-block has closed the file, so the content is on disk.
    webbrowser.open(f.name)
    
# Show the table of response counts per character in the web browser
df_window(response_df)

In [44]:
# Data exploration task 2.3

# Character rating columns of the survey, in the order they are explored below
character_names = [
    'han_solo',
    'luke_skywalker',
    'leia_organa',
    'anakin_skywalker',
    'obi_wan_kanobi',
    'emperor_palpatine',
    'darth_vader',
    'lando_calrissian',
    'boba_fett',
    'c3po',
    'r2d2',
    'jar_jar_binks',
    'padme_amidala',
    'yoda',
]

# Every answer that can appear in a character rating column
possible_response = [
    'VERY FAVORABLY',
    'SOMEWHAT FAVORABLY',
    'NEITHER FAVORABLY NOR UNFAVORABLY (NEUTRAL)',
    'SOMEWHAT UNFAVORABLY',
    'VERY UNFAVORABLY',
    'UNFAMILIAR (N/A)',
    'NOT ANSWERED',
]



In [96]:
#Explore characters with demographic Gender:
# Order in which the answers are laid out along the x axis of every chart
response = ['VERY FAVORABLY','SOMEWHAT FAVORABLY',
           'NEITHER FAVORABLY NOR UNFAVORABLY (NEUTRAL)','SOMEWHAT UNFAVORABLY','VERY UNFAVORABLY',
           'NOT ANSWERED', 'UNFAMILIAR (N/A)']
for column in character_names:
    # Number of respondents per (answer, gender) pair for this character
    grouped_data = starwars[['gender', column]].groupby([column, 'gender'])
    group_sizes = grouped_data.size()
    reordered = group_sizes.reindex(response, level=0)
    # One bar group per answer, one bar per gender
    chart_title = column + ' : Character Rating by Gender'
    reordered.unstack().plot.bar(ylim=(0,325), title=chart_title)
    plt.xticks(rotation=10)

plt.show()

In [97]:
#Explore characters with demographic Age:
# Order in which the answers are laid out along the x axis of every chart
response = ['VERY FAVORABLY','SOMEWHAT FAVORABLY',
           'NEITHER FAVORABLY NOR UNFAVORABLY (NEUTRAL)','SOMEWHAT UNFAVORABLY','VERY UNFAVORABLY',
           'NOT ANSWERED', 'UNFAMILIAR (N/A)']
for column in character_names:
    # Number of respondents per (answer, age bracket) pair for this character
    grouped_data = starwars[['age', column]].groupby([column, 'age'])
    # Use the groupby built on the line above (the original referenced an
    # undefined name `grp`, which fails on a fresh kernel).
    reordered = grouped_data.size().reindex(response, level=0)
    # One bar group per answer, one bar per age bracket
    reordered.unstack().plot.bar(ylim=(0,200), title= column + ' : Character Rating classified by Age')
    plt.xticks(rotation=10)

plt.show()

In [98]:
#Explore characters with demographic Income:
# Order in which the answers are laid out along the x axis of every chart
response = ['VERY FAVORABLY','SOMEWHAT FAVORABLY',
           'NEITHER FAVORABLY NOR UNFAVORABLY (NEUTRAL)','SOMEWHAT UNFAVORABLY','VERY UNFAVORABLY',
           'NOT ANSWERED', 'UNFAMILIAR (N/A)']
for column in character_names:
    # Number of respondents per (answer, income bracket) pair for this character
    grouped_data = starwars[['income', column]].groupby([column, 'income'])
    reordered = grouped_data.size().reindex(response, level=0)
    # One bar group per answer, one bar per income bracket.
    # The title no longer ends in a dangling ' for '.
    reordered.unstack().plot.bar(ylim=(0,200), title= column + ' : Character Rating classified by Income')
    plt.xticks(rotation=10)
plt.show()

In [99]:
#Explore characters with demographic Education:
# Order in which the answers are laid out along the x axis of every chart
response = ['VERY FAVORABLY','SOMEWHAT FAVORABLY',
           'NEITHER FAVORABLY NOR UNFAVORABLY (NEUTRAL)','SOMEWHAT UNFAVORABLY','VERY UNFAVORABLY',
           'NOT ANSWERED', 'UNFAMILIAR (N/A)']
for column in character_names:
    # Number of respondents per (answer, education level) pair for this character
    grouped_data = starwars[['education', column]].groupby([column, 'education'])
    group_sizes = grouped_data.size()
    reordered = group_sizes.reindex(response, level=0)
    # One bar group per answer, one bar per education level
    chart_title = column + ' : Character Rating classified by Education'
    reordered.unstack().plot.bar(ylim=(0,200), title=chart_title)
    plt.xticks(rotation=10)
plt.show()

In [100]:
#Explore characters with demographic Location:
# Order in which the answers are laid out along the x axis of every chart
response = ['VERY FAVORABLY','SOMEWHAT FAVORABLY',
           'NEITHER FAVORABLY NOR UNFAVORABLY (NEUTRAL)','SOMEWHAT UNFAVORABLY','VERY UNFAVORABLY',
           'NOT ANSWERED', 'UNFAMILIAR (N/A)']
for column in character_names:
    # Number of respondents per (answer, census region) pair for this character
    grouped_data = starwars[['Location_census_region', column]].groupby([column, 'Location_census_region'])
    group_sizes = grouped_data.size()
    reordered = group_sizes.reindex(response, level=0)
    # One bar group per answer, one bar per census region
    chart_title = column + ' : Character Rating classified by Census Region'
    reordered.unstack().plot.bar(ylim=(0,105), title=chart_title)
    plt.xticks(rotation=10)
plt.show()