# Accident Locations on Indian Roads

## Steps involved:
1. Collection of Data and Web Scraping
2. Preprocessing the data for all the states
3. Writing them into individual .csv files for separate shape file visualization
4. Running GeoCoding scripts to fetch the coordinates for black spots
5. Filtering outliers
6. Exporting the final CSV file for visualization in QGIS


## Data Manipulation Scripts (e.g., Maharashtra)

In [1]:
#importing required libraries for data manipulation
import os
from urllib.parse import unquote

import numpy as np
import pandas as pd
import requests

In [2]:
#loading the black-spot dataset received from MoRTH into a dataframe
df = pd.read_csv(filepath_or_buffer='maha_black_spots.csv')
df.head()

Unnamed: 0,S.\nNo.,Name of the District,Juridiction/ Police\nStation,Location of accidents\nincluding chainage (km,NH No.,No. of\nfatalities,Reasons for frequent\naccidents
0,1,Thane/Manor,,Kude to Sativali,8,46,Chowk Zig Zag Road
1,2,Thane/Ghoti,,Old kasara Ghat,3,46,S Curve and Steep Slope
2,3,Satara/Karad,,Malkapur,4,40,Steep Slope
3,4,Pune/Khandala,,Waksai,4,36,Narrow Road
4,5,Jalgaon/Paldhi,,Serve Town Chowk,6,36,Road Junction


In [4]:
#inspecting the dtype pandas inferred for every column
#(text columns show up as object; the police-station column is reported as float64 in the output)
df.dtypes

S.\nNo.                                            int64
Name of the District                              object
Juridiction/ Police\nStation                     float64
Location of accidents\nincluding chainage (km     object
NH No.                                             int64
No. of\nfatalities                                 int64
Reasons for frequent\naccidents                   object
dtype: object

In [5]:
#counting the missing values in every column
df.isnull().sum()
#the key attribute 'Location of accidents' contains no null values, so geocoding can proceed safely;
#only the police-station column reports nulls (25 of them in the output)

S.\nNo.                                           0
Name of the District                              0
Juridiction/ Police\nStation                     25
Location of accidents\nincluding chainage (km     0
NH No.                                            0
No. of\nfatalities                                0
Reasons for frequent\naccidents                   0
dtype: int64

In [16]:
#defining a function to convert attribute string values to viable query names for urls
def get_query_name(name_attribute):
    """Turn every name in name_attribute into a URL-friendly query string.

    Each '/' is replaced by '%20', and a backslash is dropped together with
    the single character that follows it. All other characters (including
    spaces) are copied unchanged.

    Returns a list with one converted string per input name, in input order.
    """
    encoded_names = []
    for raw_name in name_attribute:
        pieces = []
        skip_next = False
        for ch in raw_name:
            if skip_next:
                #this character followed a backslash, so it is discarded
                skip_next = False
                continue
            if ch == '\\':
                skip_next = True
            elif ch == '/':
                pieces.append('%20')
            else:
                pieces.append(ch)
        encoded_names.append(''.join(pieces))
    return encoded_names

In [18]:
#building two query lists: the exact location first, and the district as a fallback
#in case the exact location fails to yield a geocoded result

query_array, substitute_array = (
    get_query_name(df[column_name])
    for column_name in ('Location of accidents\nincluding chainage (km', 'Name of the District')
)
display(substitute_array)

['Thane%20Manor',
 'Thane%20Ghoti',
 'Satara%20Karad',
 'Pune%20Khandala',
 'Jalgaon%20Paldhi',
 'Jalgaon%20Paldhi',
 'Thane%20Manor',
 'Pune%20Wadgaon',
 'Satara%20Karad',
 'Thane%20Shahapur',
 'Thane%20Charoti',
 'Raigad%20Mahad',
 'Satara%20Karad',
 'Osmanabad%20Naldurga',
 'Jalgaon%20Paldhi',
 'Jalgaon%20Paldhi',
 'Thane%20Shahapur',
 'Pune%20Wadgaon',
 'Thane%20Charoti',
 'Pune%20Vadgaon',
 'Nagpur%20Ramtek',
 'Osmanabad%20Naldurga',
 'Kolhapur%20Ujalaiwadi',
 'Raigad%20Palaspe',
 'Buldhana%20Malkapur']

In [22]:
#temporary dataframe pairing every exact location with its district fallback
temp = pd.DataFrame.from_dict({'Exact Location': query_array, 'District Name': substitute_array})
temp.head(n=7)

Unnamed: 0,Exact Location,District Name
0,Kude to Sativali,Thane%20Manor
1,Old kasara Ghat,Thane%20Ghoti
2,Malkapur,Satara%20Karad
3,Waksai,Pune%20Khandala
4,Serve Town Chowk,Jalgaon%20Paldhi
5,Wadjai Nala,Jalgaon%20Paldhi
6,Saykhed Hawali,Thane%20Manor


## Geo-Coding Locations

In [29]:
#defining a function to get latitude and longitude from location name

def _is_usable_address(value):
    """Return True when value is a non-blank string that can be sent to the geocoder."""
    return isinstance(value, str) and value.strip() != ''


def _geocode_address(address, key):
    """Look up one address with the Google Geocoding API.

    Returns a (lat, lng) tuple taken from the first result, or None when the
    response carries no results.
    """
    response = requests.get(
        'https://maps.googleapis.com/maps/api/geocode/json',
        #names produced by get_query_name may already contain %20, so they are
        #decoded first and requests then percent-encodes the query exactly once
        params={'address': unquote(address), 'key': key},
        #a stalled connection must not hang the whole notebook
        timeout=10,
    )
    results = response.json().get('results')
    if not results:
        return None
    location = results[0]['geometry']['location']
    return location['lat'], location['lng']


def get_lat_lng(df, query_array, substitute_array, key):
    """Geocode every row of df, falling back to the district when needed.

    Parameters
    ----------
    df : mapping of column name to values (e.g. a DataFrame)
        Must provide 'Exact Location' and 'District Name' with one entry per row.
    query_array, substitute_array :
        Not used; kept only so that existing calls keep working.
    key : str
        Google Geocoding API key.

    Returns
    -------
    (lat, lng) : two lists with exactly one entry per row of df. A row that
    could not be resolved (no result for either name, or an error while
    querying) gets None in both lists so it can be filled in manually later.
    """
    lat = []
    lng = []
    #walking both columns in parallel keeps each location tied to its own
    #district, even when the same location name occurs in several rows
    for name, district in zip(df['Exact Location'], df['District Name']):
        coordinates = None
        try:
            if _is_usable_address(name):
                coordinates = _geocode_address(name, key)
                if coordinates is not None:
                    print('Got Results!')
                else:
                    print('Did not get results for: '+name)
            if coordinates is None:
                #using the district name if the exact location is not found
                if _is_usable_address(district):
                    print('Getting results for: '+district)
                    print('Getting district name...')
                    coordinates = _geocode_address(district, key)
                if coordinates is not None:
                    print('Got District Name!')
                else:
                    print('Got Nothing.')
        except Exception as e:
            #deliberate best effort: one failing row must not abort the whole run
            print('Error occurred:', e)
            print('Proceeding to next iteration...')
            coordinates = None
        #exactly one entry is appended per row so the lists stay aligned with df
        if coordinates is None:
            lat.append(None)
            lng.append(None)
        else:
            lat.append(coordinates[0])
            lng.append(coordinates[1])
    return lat, lng


In [30]:
#reading the API key from the environment and fetching the coordinate arrays
#(the key must never be stored in the notebook; a key that was committed here earlier should be revoked)

key=os.environ.get('GOOGLE_MAPS_API_KEY')
if not key:
    raise RuntimeError('Set the GOOGLE_MAPS_API_KEY environment variable before running the geocoding cells.')

lat_array, lng_array = get_lat_lng(temp, query_array, substitute_array, key)

Did not get results for: Kude to Sativali
Getting results for: Thane%20Manor
Getting district name...
Got District Name!
Got Results!
Got Results!
Got Results!
Did not get results for: Serve Town Chowk
Getting results for: Jalgaon%20Paldhi
Getting district name...
Got District Name!
Got Results!
Got Results!
Got Results!
Got Results!
Got Results!
Got Results!
Did not get results for: Sukeli Phata
Getting results for: Raigad%20Mahad
Getting district name...
Got District Name!
Did not get results for: Masur Fata
Getting results for: Satara%20Karad
Getting district name...
Got District Name!
Got Results!
Got Results!
Got Results!
Got Results!
Got Results!
Got Results!
Got Results!
Got Results!
Did not get results for: Malup Shivar
Getting results for: Osmanabad%20Naldurga
Getting district name...
Got District Name!
Got Results!
Did not get results for: Jeetegaon
Getting results for: Raigad%20Palaspe
Getting district name...
Got District Name!
Got Results!


In [31]:
#checking if all coordinates are appended
#(a coordinate counts as left out when a list is too short or when it only holds the None placeholder)
expected_count=len(temp['Exact Location'])
missing_count=sum(
    1 for lat_value, lng_value in zip(lat_array, lng_array)
    if lat_value is None or lng_value is None
)
if len(lat_array)==len(lng_array)==expected_count and missing_count==0:
    print('Succesfully got all Coordinates!')
else:
    print('Some Coordinates left out.')
    print('Expected:', expected_count, '| Latitudes:', len(lat_array),
          '| Longitudes:', len(lng_array), '| Unresolved:', missing_count)

Succesfully got all Coordinates!


In [33]:
#wrapping the coordinate lists in named pandas Series
Latitude, Longitude = (
    pd.Series(coordinates, name=label)
    for coordinates, label in ((lat_array, 'Latitude'), (lng_array, 'Longitude'))
)
display(Latitude, Longitude)

0     19.218331
1     17.739872
2     20.884327
3     18.765001
4     21.024535
5     18.427468
6     21.143103
7     18.737200
8     17.481403
9     17.729236
10    15.964716
11    18.082021
12    17.277693
13    17.816668
14    17.587452
15    28.620178
16    19.628332
17    18.761137
18    18.226631
19    18.686028
20    22.307159
21    17.816668
22    18.606087
23    18.953508
24    20.960876
Name: Latitude, dtype: float64

0     72.978090
1     83.259002
2     76.202608
3     73.453831
4     75.454865
5     73.981253
6     76.704790
7     73.638960
8     74.086555
9     73.939656
10    74.003571
11    73.422441
12    74.184354
13    76.273979
14    75.393681
15    77.136285
16    73.422951
17    73.557205
18    76.369517
19    73.747714
20    73.181219
21    76.273979
22    73.822792
23    73.129060
24    76.184170
Name: Longitude, dtype: float64

In [51]:
#attaching the calculated Lat and Long array
#(coordinate columns left behind by an earlier run of this cell are dropped first,
# so re-running the cell does not create duplicate Latitude/Longitude columns)
df=pd.concat([df.drop(columns=['Latitude','Longitude'],errors='ignore'),Latitude,Longitude],axis=1)
df.head(5)

Unnamed: 0,S.\nNo.,Name of the District,Juridiction/ Police\nStation,Location of accidents\nincluding chainage (km,NH No.,No. of\nfatalities,Reasons for frequent\naccidents,Latitude,Longitude
0,1,Thane/Manor,,Kude to Sativali,8,46,Chowk Zig Zag Road,19.218331,72.97809
1,2,Thane/Ghoti,,Old kasara Ghat,3,46,S Curve and Steep Slope,17.739872,83.259002
2,3,Satara/Karad,,Malkapur,4,40,Steep Slope,20.884327,76.202608
3,4,Pune/Khandala,,Waksai,4,36,Narrow Road,18.765001,73.453831
4,5,Jalgaon/Paldhi,,Serve Town Chowk,6,36,Road Junction,21.024535,75.454865


## Outlier Detection

In [50]:
#defining state border to filter out outliers

def _geocode_first_result(address, key):
    """Return the {'lat': ..., 'lng': ...} location of the first geocoding hit for address.

    Raises ValueError when the API returns no result, so a failed lookup is
    reported by name instead of surfacing as a bare IndexError.
    """
    response = requests.get(
        'https://maps.googleapis.com/maps/api/geocode/json',
        #letting requests encode the query keeps spaces and commas in the address intact
        params={'address': address, 'key': key},
        timeout=10,
    )
    response.raise_for_status()
    results = response.json().get('results')
    if not results:
        raise ValueError('No geocoding result for: ' + address)
    return results[0]['geometry']['location']

latitude_bounds=['Northernmost point of Maharashtra,Sugat','Southernmost point of Maharashtra Curchirem']
longitude_bounds=['Eastern point of Maharashtra','Westernmost point of Maharashtra']
#lat_values is [northern limit, southern limit]; long_values is [eastern limit, western limit]
lat_values=[_geocode_first_result(place, key)['lat'] for place in latitude_bounds]
long_values=[_geocode_first_result(place, key)['lng'] for place in longitude_bounds]

#### Setting upper and lower limit for Latitude and Longitude

In [56]:
#Extracting the name of outliers
#(rows are selected with boolean masks on df itself, so rows sharing a coordinate value
# each report their own name and the cell still works after df has been corrected)

location_column='Location of accidents\nincluding chainage (km'
lat_outliers=(df['Latitude']>lat_values[0])|(df['Latitude']<lat_values[1])
long_outliers=(df['Longitude']>long_values[0])|(df['Longitude']<long_values[1])
for outlier_name in df.loc[lat_outliers,location_column]:
    print(outlier_name)
for outlier_name in df.loc[long_outliers,location_column]:
    print(outlier_name)

Avtar Dhaba
Vadoda
Old kasara Ghat


In [71]:
#Manually replacing the outliers with the correct (latitude, longitude) values

location_column = 'Location of accidents\nincluding chainage (km'
corrections = {
    'Old kasara Ghat': (19.67, 73.48),
    'Vadoda': (21.1303534, 79.322614),
    'Avtar Dhaba': (21.0245349, 75.4548646),
}
for place, (latitude, longitude) in corrections.items():
    rows = df[location_column] == place
    df.loc[rows, 'Latitude'] = latitude
    df.loc[rows, 'Longitude'] = longitude

In [73]:
#writing to csv file without the dataframe indices
#(the target folder can be overridden with the BLACK_SPOTS_OUTPUT_DIR environment variable;
# when it is not set, the original location is used)
output_dir=os.environ.get('BLACK_SPOTS_OUTPUT_DIR','C:/Users/Admin/Desktop/INTEL_DATA')
df.to_csv(output_dir+'/mh_black_spots.csv', index=False)

#### Note: The same script was applied to all the other states, with only minor name changes.