# Data Cleaning

In [15]:
import pandas as pd
import numpy as np
%matplotlib inline


In [44]:
# Load the raw records from guns.csv (path is relative to the working directory).
gundf = pd.read_csv('guns.csv')

In [45]:
# Show the (rows, columns) size of the loaded frame.
gundf.shape

(100798, 11)

In [46]:
# List the dtype pandas inferred for each column.
gundf.dtypes

Unnamed: 0      int64
year            int64
month           int64
intent         object
police          int64
sex            object
age           float64
race           object
hispanic        int64
place          object
education     float64
dtype: object

In [47]:
# Count the missing values in every column.
# isna().sum() is the vectorized equivalent of applying the builtin sum() to
# each column's null mask: same per-column counts, no Python-level loop.
gundf.isna().sum()


Unnamed: 0       0
year             0
month            0
intent           1
police           0
sex              0
age             18
race             0
hispanic         0
place         1384
education       53
dtype: int64

In [9]:
# Show how often each category occurs in every text-valued column.
object_var = ['intent', 'sex', 'race', 'place']
for col in object_var:
    heading = '\nUnique frequency count for column {}'.format(col)
    print(heading)
    frequencies = gundf[col].value_counts()
    print(frequencies)


Unique frequency count for column intent
Suicide         63175
Homicide        35176
Accidental       1639
Undetermined      807
Name: intent, dtype: int64

Unique frequency count for column sex
M    86349
F    14449
Name: sex, dtype: int64

Unique frequency count for column race
White                             66237
Black                             23296
Hispanic                           9022
Asian/Pacific Islander             1326
Native American/Native Alaskan      917
Name: race, dtype: int64

Unique frequency count for column place
Home                       60486
Other specified            13751
Street                     11151
Other unspecified           8867
Trade/service area          3439
School/instiution            671
Farm                         470
Industrial/construction      248
Residential institution      203
Sports                       128
Name: place, dtype: int64


# HANDLING EACH COLUMN

In [48]:
# 'Unnamed: 0' only numbers the rows and adds nothing to the dataset, so drop it.
# errors='ignore' lets this cell be re-run after the column is already gone,
# and the result is assigned back instead of mutating the frame in place.
gundf = gundf.drop('Unnamed: 0', axis=1, errors='ignore')

In [49]:
# Preview the first three rows of the frame.
gundf.head(3)

Unnamed: 0,year,month,intent,police,sex,age,race,hispanic,place,education
0,2012,1,Suicide,0,M,34.0,Asian/Pacific Islander,100,Home,4.0
1,2012,1,Suicide,0,F,21.0,White,100,Street,3.0
2,2012,1,Suicide,0,M,60.0,White,100,Other specified,4.0


# Education

In [50]:
# Fill the missing education values with the column median.
# The filled column is assigned back to gundf: calling fillna(inplace=True) on
# the gundf['education'] selection operates on a temporary object under pandas
# Copy-on-Write, so the fill would never reach gundf.
gundf['education'] = gundf['education'].fillna(gundf['education'].median())


# Age

In [51]:
# Fill the missing age values with the column median.
# The filled column is assigned back to gundf: calling fillna(inplace=True) on
# the gundf['age'] selection operates on a temporary object under pandas
# Copy-on-Write, so the fill would never reach gundf.
gundf['age'] = gundf['age'].fillna(gundf['age'].median())


# Place

In [52]:
# Rows with no recorded 'place' are discarded rather than imputed.
gundf = gundf.dropna(subset=['place'])
# Remaining (rows, columns) after the drop.
gundf.shape

(99414, 10)

# Final Data

In [33]:
# Re-count missing values per column after cleaning.
# isna().sum() is the vectorized equivalent of applying the builtin sum() to
# each column's null mask: same per-column counts, no Python-level loop.
gundf.isna().sum()


year         0
month        0
intent       0
police       0
sex          0
age          0
race         0
hispanic     0
place        0
education    0
dtype: int64

# Encoding objects

In [54]:
from sklearn.preprocessing import LabelEncoder

# Replace each text category with an integer code, one fitted encoder per column.
# The encoders are kept in `encoders` so the integer codes can be mapped back to
# their labels later via encoders[col].classes_ or
# encoders[col].inverse_transform(...).
# NOTE(review): the null count near the top of the notebook reports one missing
# 'intent' value; confirm that row is gone before this cell runs, because the
# encoder is fitted on whatever values remain in the column.
columns = ['intent', 'sex', 'race', 'place']
encoders = {}
for col in columns:
    # `le` is rebound on every pass, so after the loop it holds the encoder
    # fitted on the last column in `columns`.
    le = LabelEncoder()
    gundf[col] = le.fit_transform(gundf[col])
    encoders[col] = le

# Write to CSV file

In [55]:
# Save the cleaned frame; index=False keeps the row index out of the file.
gundf.to_csv('gun_clean.csv',index=False)