In [2]:
 import pandas as pd
 import numpy as np 
 # Define the headers since the data does not have any
headers = [
    # risk rating / insurance
    "symboling", "normalized_losses",
    # categorical descriptors
    "make", "fuel_type", "aspiration", "num_doors", "body_style",
    "drive_wheels", "engine_location",
    # physical dimensions
    "wheel_base", "length", "width", "height", "curb_weight",
    # engine description
    "engine_type", "num_cylinders", "engine_size", "fuel_system",
    "bore", "stroke", "compression_ratio", "horsepower", "peak_rpm",
    # fuel economy and price
    "city_mpg", "highway_mpg", "price",
]

# Location of the raw, header-less data file.
AUTOS_URL = "https://archive.ics.uci.edu/ml/machine-learning-databases/autos/imports-85.data"

# Load the file with our own column names; "?" is parsed as NaN.
df = pd.read_csv(AUTOS_URL, header=None, names=headers, na_values="?")

In [3]:
# Keep only the object-dtype columns, as an independent copy so that later
# edits to `ob` never touch `df`.
object_columns = df.select_dtypes(include='object')
ob = object_columns.copy()
ob.head(2)

Unnamed: 0,make,fuel_type,aspiration,num_doors,body_style,drive_wheels,engine_location,engine_type,num_cylinders,fuel_system
0,alfa-romero,gas,std,two,convertible,rwd,front,dohc,four,mpfi
1,alfa-romero,gas,std,two,convertible,rwd,front,dohc,four,mpfi


In [4]:
# Missing values per column (isna is the same check as isnull).
ob.isna().sum()

make               0
fuel_type          0
aspiration         0
num_doors          2
body_style         0
drive_wheels       0
engine_location    0
engine_type        0
num_cylinders      0
fuel_system        0
dtype: int64

In [5]:
# Frequency of each spelled-out cylinder count; these labels are the keys
# that need a numeric mapping later on.
ob['num_cylinders'].value_counts()

four      159
six        24
five       11
eight       5
two         4
twelve      1
three       1
Name: num_cylinders, dtype: int64

In [6]:
# Fill the missing num_doors entries with "four".
# NOTE(review): presumably "four" is the most common value; the notebook never
# inspects the num_doors counts, so confirm before relying on it.
missing_fill = {'num_doors': 'four'}
ob = ob.fillna(value=missing_fill)

In [7]:
# Map the spelled-out counts to integers for the two columns that are
# numeric in meaning but stored as words.
clean_num = {
    'num_doors': {"four": 4, "two": 2},
    'num_cylinders': {'four': 4, 'two': 2, "twelve": 12, "three": 3,
                      "five": 5, "six": 6, 'eight': 8},
}

# Rebind instead of inplace=True so re-running the cell is harmless.
ob = ob.replace(clean_num)

# Cast explicitly: newer pandas no longer downcasts the replaced object
# columns to int64 on its own, so without this the dtypes shown in the next
# cell would not reproduce. A value missing from the mapping fails loudly here.
ob = ob.astype({column: 'int64' for column in clean_num})

In [8]:
# Verify the conversion: num_doors and num_cylinders should now be integer
# columns while the remaining columns stay object.
ob.dtypes

make               object
fuel_type          object
aspiration         object
num_doors           int64
body_style         object
drive_wheels       object
engine_location    object
engine_type        object
num_cylinders       int64
fuel_system        object
dtype: object

In [9]:
# Store body_style as a pandas categorical so its integer codes can be read
# off in a later cell.
df = df.astype({'body_style': 'category'})
df.dtypes

symboling               int64
normalized_losses     float64
make                   object
fuel_type              object
aspiration             object
num_doors              object
body_style           category
drive_wheels           object
engine_location        object
wheel_base            float64
length                float64
width                 float64
height                float64
curb_weight             int64
engine_type            object
num_cylinders          object
engine_size             int64
fuel_system            object
bore                  float64
stroke                float64
compression_ratio     float64
horsepower            float64
peak_rpm              float64
city_mpg                int64
highway_mpg             int64
price                 float64
dtype: object

In [10]:
# Integer code of each row's body_style category, taken from the categorical
# column on `df` (assignment aligns on the shared row index).
body_style_codes = df['body_style'].cat.codes
ob['body_style_cat'] = body_style_codes
ob.head()

Unnamed: 0,make,fuel_type,aspiration,num_doors,body_style,drive_wheels,engine_location,engine_type,num_cylinders,fuel_system,body_style_cat
0,alfa-romero,gas,std,2,convertible,rwd,front,dohc,4,mpfi,0
1,alfa-romero,gas,std,2,convertible,rwd,front,dohc,4,mpfi,0
2,alfa-romero,gas,std,2,hatchback,rwd,front,ohcv,6,mpfi,2
3,audi,gas,std,4,sedan,fwd,front,ohc,4,mpfi,3
4,audi,gas,std,4,sedan,4wd,front,ohc,5,mpfi,3


In [11]:
# Re-check dtypes after adding body_style_cat; the category codes are stored
# as a small integer type (int8 in the output below).
ob.dtypes

make               object
fuel_type          object
aspiration         object
num_doors           int64
body_style         object
drive_wheels       object
engine_location    object
engine_type        object
num_cylinders       int64
fuel_system        object
body_style_cat       int8
dtype: object

In [12]:
# How many cars fall into each body style.
ob['body_style'].value_counts()


sedan          96
hatchback      70
wagon          25
hardtop         8
convertible     6
Name: body_style, dtype: int64

In [13]:
# One-hot encode drive_wheels (preview only; `ob` itself is not modified).
# dtype is pinned so the indicator columns print as 0/1 on every pandas
# version; since pandas 2.0 the default is bool (True/False).
pd.get_dummies(ob, columns=['drive_wheels'], dtype='uint8').head(5)

Unnamed: 0,make,fuel_type,aspiration,num_doors,body_style,engine_location,engine_type,num_cylinders,fuel_system,body_style_cat,drive_wheels_4wd,drive_wheels_fwd,drive_wheels_rwd
0,alfa-romero,gas,std,2,convertible,front,dohc,4,mpfi,0,0,0,1
1,alfa-romero,gas,std,2,convertible,front,dohc,4,mpfi,0,0,0,1
2,alfa-romero,gas,std,2,hatchback,front,ohcv,6,mpfi,2,0,0,1
3,audi,gas,std,4,sedan,front,ohc,4,mpfi,3,0,1,0
4,audi,gas,std,4,sedan,front,ohc,5,mpfi,3,1,0,0


In [14]:
# One-hot encode two columns at once, with short prefixes for the generated
# column names (preview only; `ob` itself is not modified).
# dtype is pinned so the indicator columns print as 0/1 on every pandas
# version; since pandas 2.0 the default is bool (True/False).
pd.get_dummies(
    ob,
    columns=['body_style', 'drive_wheels'],
    prefix=['body', 'drive'],
    dtype='uint8',
).head(5)

Unnamed: 0,make,fuel_type,aspiration,num_doors,engine_location,engine_type,num_cylinders,fuel_system,body_style_cat,body_convertible,body_hardtop,body_hatchback,body_sedan,body_wagon,drive_4wd,drive_fwd,drive_rwd
0,alfa-romero,gas,std,2,front,dohc,4,mpfi,0,1,0,0,0,0,0,0,1
1,alfa-romero,gas,std,2,front,dohc,4,mpfi,0,1,0,0,0,0,0,0,1
2,alfa-romero,gas,std,2,front,ohcv,6,mpfi,2,0,0,1,0,0,0,0,1
3,audi,gas,std,4,front,ohc,4,mpfi,3,0,0,0,1,0,0,1,0
4,audi,gas,std,4,front,ohc,5,mpfi,3,0,0,0,1,0,1,0,0


In [15]:
# Flag engines whose type label contains the substring "ohc" (1) vs. not (0).
# regex=False: match the literal text rather than interpreting it as a pattern.
# na=False: a missing engine_type counts as "no match"; without it the NaN
# returned by str.contains is truthy inside np.where and would be coded as 1.
is_ohc = ob['engine_type'].str.contains('ohc', regex=False, na=False)
ob['ohc_code'] = np.where(is_ohc, 1, 0)

In [16]:
# Preview the frame, now including the ohc_code flag column.
ob.head(5)

Unnamed: 0,make,fuel_type,aspiration,num_doors,body_style,drive_wheels,engine_location,engine_type,num_cylinders,fuel_system,body_style_cat,ohc_code
0,alfa-romero,gas,std,2,convertible,rwd,front,dohc,4,mpfi,0,1
1,alfa-romero,gas,std,2,convertible,rwd,front,dohc,4,mpfi,0,1
2,alfa-romero,gas,std,2,hatchback,rwd,front,ohcv,6,mpfi,2,1
3,audi,gas,std,4,sedan,fwd,front,ohc,4,mpfi,3,1
4,audi,gas,std,4,sedan,4wd,front,ohc,5,mpfi,3,1


In [17]:
from sklearn.preprocessing import LabelEncoder

# Encode each manufacturer name as an integer label; the fitted encoder is
# kept in `lb_make` so the mapping can be reused later.
lb_make = LabelEncoder()
make_codes = lb_make.fit_transform(ob['make'])
ob['make_code'] = make_codes

# Show names and codes side by side to eyeball the mapping.
ob[['make', 'make_code']].head(20)


Unnamed: 0,make,make_code
0,alfa-romero,0
1,alfa-romero,0
2,alfa-romero,0
3,audi,1
4,audi,1
5,audi,1
6,audi,1
7,audi,1
8,audi,1
9,audi,1


In [18]:
from sklearn.preprocessing import LabelBinarizer

# Binarize body_style into one indicator column per class.
lb_style = LabelBinarizer()
lb_results = lb_style.fit_transform(ob['body_style'])

# Wrap the raw array in a DataFrame labelled with the fitted class names.
body_style_indicators = pd.DataFrame(lb_results, columns=lb_style.classes_)
body_style_indicators.head()

Unnamed: 0,convertible,hardtop,hatchback,sedan,wagon
0,1,0,0,0,0
1,1,0,0,0,0
2,0,0,1,0,0
3,0,0,0,1,0
4,0,0,0,1,0


In [19]:
# Second dataset: the FiveThirtyEight biopics table, read as latin1-encoded text.
BIOPICS_URL = 'https://raw.githubusercontent.com/fivethirtyeight/data/master/biopics/biopics.csv'
df1 = pd.read_csv(BIOPICS_URL, encoding='latin1')

In [20]:
# Size of the biopics table as (rows, columns).
df1.shape

(761, 14)

In [21]:
# First few rows, to see which columns are categorical text.
df1.head()

Unnamed: 0,title,site,country,year_release,box_office,director,number_of_subjects,subject,type_of_subject,race_known,subject_race,person_of_color,subject_sex,lead_actor_actress
0,10 Rillington Place,http://www.imdb.com/title/tt0066730/,UK,1971,-,Richard Fleischer,1,John Christie,Criminal,Unknown,,0,Male,Richard Attenborough
1,12 Years a Slave,http://www.imdb.com/title/tt2024544/,US/UK,2013,$56.7M,Steve McQueen,1,Solomon Northup,Other,Known,African American,1,Male,Chiwetel Ejiofor
2,127 Hours,http://www.imdb.com/title/tt1542344/,US/UK,2010,$18.3M,Danny Boyle,1,Aron Ralston,Athlete,Unknown,,0,Male,James Franco
3,1987,http://www.imdb.com/title/tt2833074/,Canada,2014,-,Ricardo Trogi,1,Ricardo Trogi,Other,Known,White,0,Male,Jean-Carl Boucher
4,20 Dates,http://www.imdb.com/title/tt0138987/,US,1998,$537K,Myles Berkowitz,1,Myles Berkowitz,Other,Unknown,,0,Male,Myles Berkowitz


In [22]:
# Number of distinct values in `country`. Combined entries such as "US/UK"
# count as their own value.
df1['country'].nunique()

7

In [23]:
# Integer-encode subject_race through a categorical; rows with a missing
# subject_race receive the code -1.
subject_race_cat = df1['subject_race'].astype('category')
df1['sub_race'] = subject_race_cat.cat.codes

In [24]:
# Spot-check ten random rows of the encoded frame. The seed is fixed so that
# a fresh Restart & Run All displays the same rows every time.
df1.sample(10, random_state=42)

Unnamed: 0,title,site,country,year_release,box_office,director,number_of_subjects,subject,type_of_subject,race_known,subject_race,person_of_color,subject_sex,lead_actor_actress,sub_race
551,The Color of Freedom,http://www.imdb.com/title/tt0438859/,UK,2007,-,Bille August,1,James Gregory,Other,Known,White,0,Male,Joseph Fiennes,16
233,Gorillas in the Mist,http://www.imdb.com/title/tt0095243/,US,1988,$24.7M,Michael Apted,1,Dian Fossey,Academic,Unknown,,0,Female,Sigourney Weaver,-1
130,Che!,http://www.imdb.com/title/tt0064158/,US,1969,$2M,Richard Fleischer,1,Che Guevara,Activist,Known,Hispanic (Latin American),0,Male,Omar Sharif,6
688,The Winning Team,http://www.imdb.com/title/tt0045332/,US,1952,-,Lewis Seiler,1,Grover Cleveland Alexander,Athlete,Known,White,0,Male,Ronald Reagan,16
201,Fear Strikes Out,http://www.imdb.com/title/tt0050383/,US,1957,-,Robert Mulligan,1,Jimmy Piersall,Athlete,Unknown,,0,Male,Anthony Perkins,-1
441,Quo Vadis,http://www.imdb.com/title/tt0043949/,US,1951,-,Mervyn LeRoy,2,Lygia,Historical,Known,White,0,Female,Deborah Kerr,16
514,Swimming Upstream,http://www.imdb.com/title/tt0326664/,US,2003,$47.2K,Russell Mulcahy,1,Tony Fingleton,Athlete,Unknown,,0,Male,Geoffrey Rush,-1
502,Star!,http://www.imdb.com/title/tt0063642/,US,1968,$4M,Robert Wise,1,Gertrude Lawrence,Actress,Known,White,0,Female,Julie Andrews,16
758,Young Tom Edison,http://www.imdb.com/title/tt0033289/,US,1940,-,Norman Taurog,1,Thomas Edison,Other,Known,White,0,Male,Mickey Rooney,16
71,Big Eyes,http://www.imdb.com/title/tt1126590/,US/Canada,2014,$14.5M,Tim Burton,1,Margaret Keane,Artist,Known,White,0,Female,Amy Adams,16


In [25]:
# Frequency of each recorded subject_race. value_counts() skips NaN by
# default, so films without a recorded race are not included in these counts.
df1['subject_race'].value_counts()

White                        428
African American              35
Multi racial                  24
Hispanic (Latin American)     13
Middle Eastern (White)        13
Middle Eastern                11
African                        9
Hispanic (White)               6
Hispanic (Latino)              6
Asian                          5
Native American                4
Asian American                 3
Indian                         2
Caribbean                      2
Eurasian                       1
Mediterranean                  1
Hispanic (Latina)              1
Name: subject_race, dtype: int64

In [28]:
# Same distinct-country check as in cell In [22].
# NOTE(review): the execution count jumps from 25 to 28, so cells were run
# out of order or deleted; confirm the notebook survives Restart & Run All.
df1['country'].nunique()

7