### Railways FRA Data Analysis 
#### (DS 6001 - Final Pipeline)

In [1]:
import sqlite3
import pandas as pd
import numpy as np
%matplotlib inline

In [2]:
# SQLite database file that holds the FRA tables read and written by this notebook.
DB_PATH = "fra.db"
conn = sqlite3.connect(DB_PATH)

In [3]:
# Load every row of the `inc` table, using its stored `index` column as the frame index.
df = pd.read_sql(sql='SELECT * FROM inc', con=conn, index_col='index')

In [4]:
# Preview the first rows of the table as loaded, before any cleaning.
df.head()

Unnamed: 0_level_0,IYR,IMO,RAILROAD,INCDTNO,IYR2,IMO2,RR2,INCDTNO2,IYR3,IMO3,...,NARR15,RCL,Latitude,Longitud,SIGNAL,MOPERA,ADJUNCT1,ADJUNCT2,ADJUNCT3,SUBDIV
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,75,12,WP,2122875,,,,,75,12,...,,,,,,,,,,
1,75,12,WP,2122575,,,,,75,12,...,,,,,,,,,,
2,75,12,WP,2122975,,,,,75,12,...,,,,,,,,,,
3,75,12,WP,2121875,,,,,75,12,...,,,,,,,,,,
4,75,12,WP,2121175,,,,,75,12,...,,,,,,,,,,


In [5]:
# (rows, columns) of the table as loaded, before any cleaning.
df.shape

(204830, 145)

### Data Cleaning
### 1) Transform Variable Encoding

#### Replacing numeric values of TYPE with their descriptive names.

In [6]:
# Descriptive labels for the TYPE codes, listed in the order of the sorted codes.
type_labels = [
    'Derailment', 'Head on collision', 'Rear end collision', 'Side collision',
    'Raking collision', 'Broken train collision', 'Hwy-rail crossing',
    'RR grade crossing', 'Obstruction', 'Explosion-detonation',
    'Fire/violent rupture', 'Other impacts', 'Other',
]

# Convert TYPE to a categorical so every code can be relabelled in one step.
df['TYPE'] = df['TYPE'].astype('category')

# Relabel by position: pandas matches each label to the existing category at the
# same position, so the list must hold exactly one label per distinct TYPE code.
# `rename_categories` is used instead of assigning to `.cat.categories`, which
# newer pandas versions no longer allow.
df['TYPE'] = df['TYPE'].cat.rename_categories(type_labels)
df.TYPE.head()

index
0    Side collision
1        Derailment
2       Obstruction
3        Derailment
4        Derailment
Name: TYPE, dtype: category
Categories (13, object): [Derailment, Head on collision, Rear end collision, Side collision, ..., Explosion-detonation, Fire/violent rupture, Other impacts, Other]

#### Replacing numeric values of TYPEQ with their descriptive names.

In [7]:
# Descriptive labels for the TYPEQ codes, listed in the order of the sorted codes.
# The first label belongs to the '0' placeholder inserted below for missing values.
typeq_labels = [
    'Missing', 'Freight Train', 'Passenger Train-Pulling', 'Commuter Train-Pulling',
    'Work train', 'Single Car', 'Cut of cars', 'Yard/switching', 'Light loco(s)',
    'Maint./inspect. Car', 'Spec. MoW Equip.', 'Passenger Train-Pushing',
    'Commuter Train-Pushing', 'EMU', 'DMU',
]

# Stand in '0' for missing values, then keep only the first character of each code.
# `.str[0]` is vectorised and yields NaN for an empty string instead of raising
# IndexError the way a per-row `x[0]` would.
typeq_codes = df['TYPEQ'].fillna('0').str[0]

# Convert to categorical and relabel by position (one label per distinct code).
# `rename_categories` replaces assignment to `.cat.categories`, which newer pandas
# versions no longer allow.
df['TYPEQ'] = typeq_codes.astype('category').cat.rename_categories(typeq_labels)
df.TYPEQ.head()

index
0    Yard/switching
1     Freight Train
2    Yard/switching
3     Freight Train
4     Freight Train
Name: TYPEQ, dtype: category
Categories (15, object): [Missing, Freight Train, Passenger Train-Pulling, Commuter Train-Pulling, ..., Passenger Train-Pushing, Commuter Train-Pushing, EMU, DMU]

#### Replacing CAUSE with first letter of code.

In [8]:
# Reduce each cause code to its first character.
# `.str[0]` is vectorised and leaves missing or empty values as NaN, where a
# per-row `x[0]` would raise on them.
df['CAUSE'] = df['CAUSE'].str[0]
df.CAUSE.head()

index
0    H
1    M
2    H
3    M
4    T
Name: CAUSE, dtype: object

### 2) Imputing missing values in TYPEQ. 

In [9]:
# Turn the placeholder 'Missing' label back into a genuine NaN so it can be imputed.
typeq_is_missing = df['TYPEQ'] == 'Missing'
df.loc[typeq_is_missing, 'TYPEQ'] = np.nan

# TYPEQ is categorical, so fill the gaps with its most frequent value (the mode).
most_common_typeq = df['TYPEQ'].mode()[0]
df['TYPEQ'] = df['TYPEQ'].fillna(most_common_typeq)

### 3) Joining narratives

In [10]:
# Names of the narrative fragment columns: NARR1 through NARR15.
narrative_columns = ['NARR' + str(part) for part in range(1, 16)]

# Missing fragments become empty strings so they add nothing when concatenated.
df[narrative_columns] = df[narrative_columns].fillna('')

# Concatenate the fragments in order, with no separator, into one text column.
narrative = ''
for part_column in narrative_columns:
    narrative = narrative + df[part_column]
df['Narrative'] = narrative

# The individual fragment columns are no longer needed once 'Narrative' exists.
df = df.drop(columns=narrative_columns)

In [11]:
# Preview the frame now that the NARR1..NARR15 columns are replaced by 'Narrative'.
df.head()

Unnamed: 0_level_0,IYR,IMO,RAILROAD,INCDTNO,IYR2,IMO2,RR2,INCDTNO2,IYR3,IMO3,...,RCL,Latitude,Longitud,SIGNAL,MOPERA,ADJUNCT1,ADJUNCT2,ADJUNCT3,SUBDIV,Narrative
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,75,12,WP,2122875,,,,,75,12,...,,,,,,,,,,
1,75,12,WP,2122575,,,,,75,12,...,,,,,,,,,,
2,75,12,WP,2122975,,,,,75,12,...,,,,,,,,,,
3,75,12,WP,2121875,,,,,75,12,...,,,,,,,,,,
4,75,12,WP,2121175,,,,,75,12,...,,,,,,,,,,


### 4) Dropping duplicates.

In [12]:
# Keep only the rows whose JOINTCD equals 1, then discard any rows that are exact
# copies of an earlier row (the first occurrence is the one retained).
jointcd_is_one = df['JOINTCD'] == 1
df = df.loc[jointcd_is_one].drop_duplicates(keep='first')
df.shape

(166599, 131)

### 5) Correcting for inflation (time value of money).

#### Downloaded the updated CPI series from https://data.bls.gov/timeseries/CUUR0000SA0 into a CSV file named 'cpi.csv'. The inflation multiplier is derived from its `cpi` column in the cell below.

In [13]:
# CPI value that all amounts are rescaled to.
# NOTE(review): 252.885 is presumably the CPI of the reference period chosen for
# the adjustment - confirm against the BLS series.
BASE_CPI = 252.885

df_cpi = pd.read_csv('cpi.csv')

# Multiplier that converts an amount from a row's period into BASE_CPI-level dollars.
df_cpi['inflation'] = BASE_CPI / df_cpi['cpi']
df_cpi.head()

Unnamed: 0,month,year,cpi,inflation
0,1,1975,52.1,4.853839
1,1,1976,55.6,4.548291
2,1,1977,58.5,4.322821
3,1,1978,62.5,4.04616
4,1,1979,68.3,3.702562


In [14]:
# Rename the CPI key columns to the names used for the merge with the incident
# table (MONTH, YEAR4). Renaming by name rather than by position keeps the labels
# correct even if the column order of the CSV changes.
df_cpi = df_cpi.rename(columns={'month': 'MONTH', 'year': 'YEAR4'})

In [15]:
# Attach the cpi / inflation values to every incident by month and year.
# - how='inner' keeps only incidents that have a matching CPI row.
# - validate='many_to_one' raises if the CPI table holds more than one row for a
#   (MONTH, YEAR4) pair, which would otherwise silently duplicate incidents.
merged_df = pd.merge(
    left=df,
    right=df_cpi,
    how='inner',
    on=['MONTH', 'YEAR4'],
    validate='many_to_one',
)

In [16]:
# Preview the merged frame; the cpi and inflation columns appear at the end.
merged_df.head()

Unnamed: 0,IYR,IMO,RAILROAD,INCDTNO,IYR2,IMO2,RR2,INCDTNO2,IYR3,IMO3,...,Longitud,SIGNAL,MOPERA,ADJUNCT1,ADJUNCT2,ADJUNCT3,SUBDIV,Narrative,cpi,inflation
0,75,12,WP,2122875,,,,,75,12,...,,,,,,,,,55.5,4.556486
1,75,12,WP,2122575,,,,,75,12,...,,,,,,,,,55.5,4.556486
2,75,12,WP,2122975,,,,,75,12,...,,,,,,,,,55.5,4.556486
3,75,12,WP,2121875,,,,,75,12,...,,,,,,,,,55.5,4.556486
4,75,12,WP,2121175,,,,,75,12,...,,,,,,,,,55.5,4.556486


In [17]:
# Scale each ACCDMG value by its row's inflation multiplier.
adjusted_damage = merged_df['ACCDMG'].mul(merged_df['inflation'])
merged_df['Adj_ACCDMG'] = adjusted_damage

# Compare raw and adjusted values on the last rows.
merged_df[['ACCDMG', 'Adj_ACCDMG']].tail()

Unnamed: 0,ACCDMG,Adj_ACCDMG
166594,34403,34580.356197
166595,109000,109561.922667
166596,59462,59768.541703
166597,14674,14749.648195
166598,43203,43425.722431


### Store in a table

In [18]:
# Write the cleaned frame to the `df_clean` table, replacing any existing table of
# that name; the frame's index is stored as a column as well.
merged_df.to_sql(name='df_clean', con=conn, if_exists='replace', index=True)

In [19]:
# Release the SQLite connection opened at the start of the notebook.
conn.close()