# Project 5

In [1]:
import pandas as pd
import altair as alt
import numpy as np

# "ISO-8859-1" will fix the problem with reading special characters of our data
sw_cols = pd.read_csv("StarWars.csv", encoding= "ISO-8859-1", header=None, nrows =2) # Select only the first 2 rows of our data with no header
sw_data = pd.read_csv("StarWars.csv", encoding= "ISO-8859-1", header=None, skiprows=2) # Skip the first 2 rows with no header


# Question 1

We will need to fix and clean up our data first:
1. We need specific names for our columns 

>>> Reading recommended for this week: Python for Data Science: Strings


In [2]:
# We need a Series to manipulate the columns names
# We can use .iloc() to pull the first row as a column
# To fill the NaN fills between question que can use forward fill .ffill()
# Replace spaces with .replace(" ", "_") or 
# Replace spaces with .replace("Have you seen any of the 6 films in the Star Wars franchise?", "seen_") 
question_cols = (sw_cols.iloc[0,:]
                    .ffill()
                    .replace("Which of the following Star Wars films have you seen? Please select all that apply.", "seen")  # 0 is the first row
                    .replace("Have you seen any of the 6 films in the Star Wars franchise?","seen_any")
                    .replace("Do you consider yourself to be a fan of the Star Wars film franchise?", "is_fan_sw")
                    .replace("Please rank the Star Wars films in order of preference with 1 being your favorite film in the franchise and 6 being your least favorite film.", "mrank")
                    .replace("Please state whether you view the following characters favorably, unfavorably, or are unfamiliar with him/her.", "crank_")
                    .replace("Which character shot first?", "shot_first")
                    .replace("Are you familiar with the Expanded Universe?", "expanded_universe")
                    .replace("Do you consider yourself to be a fan of the Expanded Universe?\x8cæ", "fan_expanded_universe")
                    .replace("Do you consider yourself to be a fan of the Star Trek franchise?", "fan_star_trek")
                    .str.replace(" ","_")
                    .str.replace("\(|\)","")
                    .str.lower())


In [3]:
options_cols = (sw_cols.iloc[1,:]
                    .replace("Response","")
                    .str.replace("Star Wars: Episode", "")
                    .str.lower()
                    .str.replace(" ","_")
                    .fillna("")
                    )



In [4]:
join_cols_names = question_cols + options_cols


In [5]:
sw_data.columns = join_cols_names


In [6]:
sw_data.columns

Index(['respondentid', 'seen_any', 'is_fan_sw', 'seen_i__the_phantom_menace',
       'seen_ii__attack_of_the_clones', 'seen_iii__revenge_of_the_sith',
       'seen_iv__a_new_hope', 'seen_v_the_empire_strikes_back',
       'seen_vi_return_of_the_jedi', 'mrank_i__the_phantom_menace',
       'mrank_ii__attack_of_the_clones', 'mrank_iii__revenge_of_the_sith',
       'mrank_iv__a_new_hope', 'mrank_v_the_empire_strikes_back',
       'mrank_vi_return_of_the_jedi', 'crank_han_solo', 'crank_luke_skywalker',
       'crank_princess_leia_organa', 'crank_anakin_skywalker',
       'crank_obi_wan_kenobi', 'crank_emperor_palpatine', 'crank_darth_vader',
       'crank_lando_calrissian', 'crank_boba_fett', 'crank_c-3p0',
       'crank_r2_d2', 'crank_jar_jar_binks', 'crank_padme_amidala',
       'crank_yoda', 'shot_first', 'expanded_universe',
       'fan_expanded_universe', 'fan_star_trek', 'gender', 'age',
       'household_income', 'education', 'location_census_region'],
      dtype='object')

# Question 2


In [7]:
# Find the percentage of how many women and men have seen a film of stars wars
sw_data.value_counts(["gender","seen_any"], sort = False)

gender  seen_any
Female  No          152
        Yes         397
Male    No           74
        Yes         423
dtype: int64

In [8]:
print((sw_data.query("gender == 'Male'")
        .seen_any
        .value_counts(normalize=True)
        ).to_markdown())

|     |   seen_any |
|:----|-----------:|
| Yes |   0.851107 |
| No  |   0.148893 |


In [9]:
print((sw_data.query("gender == 'Female'")
        .seen_any
        .value_counts(normalize=True)        
        ).to_markdown())

|     |   seen_any |
|:----|-----------:|
| Yes |   0.723133 |
| No  |   0.276867 |


In [10]:
sw_data.filter(regex="^seen__").dropna(how="all").shape

(0, 0)

In [11]:
print(sw_data["shot_first"].dropna().shape)

shot = sw_data["shot_first"].dropna().value_counts(normalize=True).reset_index()
shot['percent'] = shot["shot_first"]
shot

(828,)


Unnamed: 0,index,shot_first,percent
0,Han,0.392512,0.392512
1,I don't understand this question,0.369565,0.369565
2,Greedo,0.237923,0.237923


In [12]:
base_chart1 = (alt.Chart(shot)
    .mark_bar()
    .encode(
        x = alt.X("shot_first", axis = None),
        y = alt.Y("index", sort=['Han',"Greedo","I don't understand this question"], title=None)
    )
    .properties(
        title = {
                "text" : ["Which Character Shot First?"],
                "subtitle" : ["According to 828 respondents"]}
    ))

chart_text1 = (alt.Chart(shot)
    .mark_text(
        align = 'left',
        dx = 3
    )
    .encode(
        x = alt.X("shot_first"),
        y = alt.Y("index", sort=['Han',"Greedo","I don't understand this question"]),
        text = alt.Text('percent' ,format='.0%')     
    
    ))

base_chart1 + chart_text1

In [13]:
seen_dummies = pd.get_dummies(sw_data.dropna(how="all", subset=['gender']).query("seen_any == 'Yes'").filter(['seen_i__the_phantom_menace',
       'seen_ii__attack_of_the_clones', 'seen_iii__revenge_of_the_sith',
       'seen_iv__a_new_hope', 'seen_v_the_empire_strikes_back',
       'seen_vi_return_of_the_jedi']))

seen_dummies.columns = ['The Phantom Menace',
       'Attack of the Clones', 'Revenge of the Sith',
       'A New Hope', 'The Empire Strikes Back',
       'Return of the Jedi']

seen_dummies = seen_dummies.sum().reset_index().rename(columns={0:"counts"}).assign(percent = lambda x: x.counts / 820)


In [14]:
sw_data.query("seen_any == 'Yes'").shape

(936, 38)

In [15]:
base_chart2 = (alt.Chart(seen_dummies)
                .encode(x = alt.X("percent", axis = None),
                        y = alt.Y("index", title=None, sort=['The Phantom Menace',
       'Attack of the Clones', 'Revenge of the Sith',
       'A New Hope', 'The Empire Strikes Back',
       'Return of the Jedi'])
                        )
                .properties(
                    title = {
                        "text" : ["Which 'Star Wars' Movies Have You Seen?"],
                        "subtitle": ["Of 820 respondents who have seen any film and declared a gender"]
                    }
                )
                .mark_bar(color='green')
)

chart_text2 = (alt.Chart(seen_dummies)
    .mark_text(
        align = 'left',
        dx = 3
    )
    .encode(
        x = alt.X("percent"),
        y = alt.Y("index", sort=['The Phantom Menace',
       'Attack of the Clones', 'Revenge of the Sith',
       'A New Hope', 'The Empire Strikes Back',
       'Return of the Jedi']),
        text = alt.Text('percent' ,format='.0%')     
    
    ))

base_chart2 + chart_text2

In [16]:
pd.get_dummies(sw_data["shot_first"])

Unnamed: 0,Greedo,Han,I don't understand this question
0,0,0,1
1,0,0,0
2,0,0,1
3,0,0,1
4,1,0,0
...,...,...,...
1181,0,1,0
1182,0,0,1
1183,0,0,0
1184,0,1,0


In [17]:
pd.get_dummies(sw_data["is_fan_sw"], drop_first = True)

Unnamed: 0,Yes
0,1
1,0
2,0
3,1
4,1
...,...
1181,1
1182,1
1183,0
1184,1


# Question 3

In [18]:
# Part A
q3 = sw_data.query('seen_any == "Yes"')
q3

Unnamed: 0,respondentid,seen_any,is_fan_sw,seen_i__the_phantom_menace,seen_ii__attack_of_the_clones,seen_iii__revenge_of_the_sith,seen_iv__a_new_hope,seen_v_the_empire_strikes_back,seen_vi_return_of_the_jedi,mrank_i__the_phantom_menace,...,crank_yoda,shot_first,expanded_universe,fan_expanded_universe,fan_star_trek,gender,age,household_income,education,location_census_region
0,3292879998,Yes,Yes,Star Wars: Episode I The Phantom Menace,Star Wars: Episode II Attack of the Clones,Star Wars: Episode III Revenge of the Sith,Star Wars: Episode IV A New Hope,Star Wars: Episode V The Empire Strikes Back,Star Wars: Episode VI Return of the Jedi,3.0,...,Very favorably,I don't understand this question,Yes,No,No,Male,18-29,,High school degree,South Atlantic
2,3292765271,Yes,No,Star Wars: Episode I The Phantom Menace,Star Wars: Episode II Attack of the Clones,Star Wars: Episode III Revenge of the Sith,,,,1.0,...,Unfamiliar (N/A),I don't understand this question,No,,No,Male,18-29,"$0 - $24,999",High school degree,West North Central
3,3292763116,Yes,Yes,Star Wars: Episode I The Phantom Menace,Star Wars: Episode II Attack of the Clones,Star Wars: Episode III Revenge of the Sith,Star Wars: Episode IV A New Hope,Star Wars: Episode V The Empire Strikes Back,Star Wars: Episode VI Return of the Jedi,5.0,...,Very favorably,I don't understand this question,No,,Yes,Male,18-29,"$100,000 - $149,999",Some college or Associate degree,West North Central
4,3292731220,Yes,Yes,Star Wars: Episode I The Phantom Menace,Star Wars: Episode II Attack of the Clones,Star Wars: Episode III Revenge of the Sith,Star Wars: Episode IV A New Hope,Star Wars: Episode V The Empire Strikes Back,Star Wars: Episode VI Return of the Jedi,5.0,...,Somewhat favorably,Greedo,Yes,No,No,Male,18-29,"$100,000 - $149,999",Some college or Associate degree,West North Central
5,3292719380,Yes,Yes,Star Wars: Episode I The Phantom Menace,Star Wars: Episode II Attack of the Clones,Star Wars: Episode III Revenge of the Sith,Star Wars: Episode IV A New Hope,Star Wars: Episode V The Empire Strikes Back,Star Wars: Episode VI Return of the Jedi,1.0,...,Very favorably,Han,Yes,No,Yes,Male,18-29,"$25,000 - $49,999",Bachelor degree,Middle Atlantic
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1180,3288389603,Yes,Yes,Star Wars: Episode I The Phantom Menace,Star Wars: Episode II Attack of the Clones,Star Wars: Episode III Revenge of the Sith,Star Wars: Episode IV A New Hope,Star Wars: Episode V The Empire Strikes Back,Star Wars: Episode VI Return of the Jedi,3.0,...,Very favorably,Han,No,,No,Female,45-60,"$0 - $24,999",Some college or Associate degree,Pacific
1181,3288388730,Yes,Yes,Star Wars: Episode I The Phantom Menace,Star Wars: Episode II Attack of the Clones,Star Wars: Episode III Revenge of the Sith,Star Wars: Episode IV A New Hope,Star Wars: Episode V The Empire Strikes Back,Star Wars: Episode VI Return of the Jedi,5.0,...,Very favorably,Han,No,,Yes,Female,18-29,"$0 - $24,999",Some college or Associate degree,East North Central
1182,3288378779,Yes,Yes,Star Wars: Episode I The Phantom Menace,Star Wars: Episode II Attack of the Clones,Star Wars: Episode III Revenge of the Sith,Star Wars: Episode IV A New Hope,Star Wars: Episode V The Empire Strikes Back,Star Wars: Episode VI Return of the Jedi,4.0,...,Very favorably,I don't understand this question,No,,Yes,Female,30-44,"$50,000 - $99,999",Bachelor degree,Mountain
1184,3288373068,Yes,Yes,Star Wars: Episode I The Phantom Menace,Star Wars: Episode II Attack of the Clones,Star Wars: Episode III Revenge of the Sith,Star Wars: Episode IV A New Hope,Star Wars: Episode V The Empire Strikes Back,Star Wars: Episode VI Return of the Jedi,4.0,...,Very favorably,Han,No,,Yes,Female,45-60,"$100,000 - $149,999",Some college or Associate degree,East North Central


In [19]:
# Part B
ml_age = (q3.age
    .str.replace("> ", "")
    .str.split("-", expand=True)
    .rename(columns= {0:"age_min",1:"age_max"})
    .age_min
    .astype("float")
    )

ml_age.tail()

1180    45.0
1181    18.0
1182    30.0
1184    45.0
1185    60.0
Name: age_min, dtype: float64

In [20]:
# Part C
ml_school = (q3.education.
        str.replace('Less than high school degree', '9').
        str.replace('High school degree', '12').
        str.replace('Some college or Associate degree', '14').
        str.replace('Bachelor degree', '16').
        str.replace('Graduate degree', '20').
        astype('float'))

In [21]:
# Part D
ml_income =(q3.household_income
                .str.replace("\$|,|\+","")
                .str.split(" - ", expand = True)
                .rename(columns= {0:"income_min",1:"income_max"})
                .income_min
                .astype("float"))

ml_income

0            NaN
2            0.0
3       100000.0
4       100000.0
5        25000.0
          ...   
1180         0.0
1181         0.0
1182     50000.0
1184    100000.0
1185     50000.0
Name: income_min, Length: 936, dtype: float64

In [22]:
categories_numeric = pd.concat([ml_age, ml_school, ml_income], axis=1)

print(categories_numeric.tail().to_markdown())

|      |   age_min |   education |   income_min |
|-----:|----------:|------------:|-------------:|
| 1180 |        45 |          14 |            0 |
| 1181 |        18 |          14 |            0 |
| 1182 |        30 |          16 |        50000 |
| 1184 |        45 |          14 |       100000 |
| 1185 |        60 |          20 |        50000 |


In [23]:
# Part E
ml_dummies = pd.get_dummies(q3.filter(['is_fan_sw', 'seen_i__the_phantom_menace',
       'seen_ii__attack_of_the_clones', 'seen_iii__revenge_of_the_sith',
       'seen_iv__a_new_hope', 'seen_v_the_empire_strikes_back',
       'seen_vi_return_of_the_jedi', 'crank_han_solo', 'crank_luke_skywalker',
       'crank_princess_leia_organa', 'crank_anakin_skywalker',
       'crank_obi_wan_kenobi', 'crank_emperor_palpatine', 'crank_darth_vader',
       'crank_lando_calrissian', 'crank_boba_fett', 'crank_c-3p0',
       'crank_r2_d2', 'crank_jar_jar_binks', 'crank_padme_amidala',
       'crank_yoda', 'shot_first', 'expanded_universe',
       'fan_expanded_universe', 'fan_star_trek', 'gender', 'location_census_region']), drop_first=True)

print(ml_dummies.iloc[: , :5].tail().to_markdown())

|      |   is_fan_sw_Yes |   crank_han_solo_Somewhat favorably |   crank_han_solo_Somewhat unfavorably |   crank_han_solo_Unfamiliar (N/A) |   crank_han_solo_Very favorably |
|-----:|----------------:|------------------------------------:|--------------------------------------:|----------------------------------:|--------------------------------:|
| 1180 |               1 |                                   0 |                                     0 |                                 0 |                               1 |
| 1181 |               1 |                                   0 |                                     0 |                                 0 |                               1 |
| 1182 |               1 |                                   0 |                                     0 |                                 0 |                               1 |
| 1184 |               1 |                                   0 |                                     0 |                     

In [24]:
starwars_ml = pd.concat([ml_dummies, 
                         q3.filter(['mrank_i__the_phantom_menace', 'mrank_ii__attack_of_the_clones',
       'mrank_iii__revenge_of_the_sith', 'mrank_iv__a_new_hope',
       'mrank_v_the_empire_strikes_back', 'mrank_vi_return_of_the_jedi']),
                         ml_age, 
                         ml_school, 
                         ml_income], axis=1)


In [25]:

starwars_ml.head()


Unnamed: 0,is_fan_sw_Yes,crank_han_solo_Somewhat favorably,crank_han_solo_Somewhat unfavorably,crank_han_solo_Unfamiliar (N/A),crank_han_solo_Very favorably,crank_han_solo_Very unfavorably,crank_luke_skywalker_Somewhat favorably,crank_luke_skywalker_Somewhat unfavorably,crank_luke_skywalker_Unfamiliar (N/A),crank_luke_skywalker_Very favorably,...,location_census_region_West South Central,mrank_i__the_phantom_menace,mrank_ii__attack_of_the_clones,mrank_iii__revenge_of_the_sith,mrank_iv__a_new_hope,mrank_v_the_empire_strikes_back,mrank_vi_return_of_the_jedi,age_min,education,income_min
0,1,0,0,0,1,0,0,0,0,1,...,0,3.0,2.0,1.0,4.0,5.0,6.0,18.0,12.0,
2,0,1,0,0,0,0,1,0,0,0,...,0,1.0,2.0,3.0,4.0,5.0,6.0,18.0,12.0,0.0
3,1,0,0,0,1,0,0,0,0,1,...,0,5.0,6.0,1.0,2.0,4.0,3.0,18.0,14.0,100000.0
4,1,0,0,0,1,0,1,0,0,0,...,0,5.0,4.0,6.0,2.0,1.0,3.0,18.0,14.0,100000.0
5,1,0,0,0,1,0,0,0,0,1,...,0,1.0,4.0,3.0,6.0,5.0,2.0,18.0,16.0,25000.0


In [26]:
starwars_ml = starwars_ml.dropna()

In [27]:

#
features = starwars_ml.drop(['income_min'], axis=1) # aka, "features"
targets = (starwars_ml.income_min >= 50000)*1 # aka, "target"


In [28]:
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn import metrics

from sklearn.tree import DecisionTreeClassifier
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier

from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

In [29]:
x_train, x_test, y_train, y_test = train_test_split(features, targets, test_size = .34, random_state = 1)

In [30]:
# create a classification model
classifier_DT = DecisionTreeClassifier(max_depth = 7, random_state=1)

# train the model
classifier_DT.fit(x_train, y_train)

# use your model to make predictions!
y_predicted = classifier_DT.predict(x_test)

# test how accurate those predictions are
metrics.accuracy_score(y_test, y_predicted)

0.62882096069869

In [31]:
print(classification_report(y_test, y_predicted))

              precision    recall  f1-score   support

           0       0.49      0.39      0.44        84
           1       0.69      0.77      0.72       145

    accuracy                           0.63       229
   macro avg       0.59      0.58      0.58       229
weighted avg       0.61      0.63      0.62       229



In [48]:
feature_df = pd.DataFrame({'features':features.columns, 'importance':classifier_DT.feature_importances_})
print(feature_df.sort_values('importance', ascending = False).head(10).to_markdown())


|    | features                               |   importance |
|---:|:---------------------------------------|-------------:|
| 92 | education                              |    0.0848805 |
| 90 | mrank_vi_return_of_the_jedi            |    0.0816397 |
| 91 | age_min                                |    0.0766213 |
| 86 | mrank_ii__attack_of_the_clones         |    0.0585088 |
| 58 | crank_jar_jar_binks_Unfamiliar (N/A)   |    0.0502947 |
| 51 | crank_r2_d2_Somewhat favorably         |    0.0478215 |
| 88 | mrank_iv__a_new_hope                   |    0.0445382 |
| 64 | crank_padme_amidala_Very favorably     |    0.0435831 |
| 82 | location_census_region_South Atlantic  |    0.0350944 |
| 61 | crank_padme_amidala_Somewhat favorably |    0.0332416 |


In [43]:
char = (alt.Chart(feature_df.sort_values('importance', ascending = False).head(10))
    .mark_bar()
    .encode(
        x= 'importance:Q',
        y= alt.Y('features:N', sort='-x'),
        
        )
    .properties( title="Feature Importance from DT Model")
)

char

## Question 1
Shorten the column names and clean them up for easier use with pandas.
