# CORONA VIRUS (COVID-19)

## Import Libraries

In [1]:
import os #for files and folder handling
import re #for reguler expression
import glob #for listing files in a folder
import requests # for getting web contents
import pandas as pd # storing and analysing data
from bs4 import BeautifulSoup # for scraping web contents
import numpy as np # for numerical analysis
from datetime import datetime #for date and time operations

## Web Scraping

In [2]:
# URL of the Ministry of Health and Family Welfare page that publishes the data
url = 'https://www.mohfw.gov.in/'

# Fetch the page. The timeout keeps the notebook from hanging indefinitely on
# a stalled connection, and raise_for_status() surfaces HTTP errors here
# instead of letting an error page be parsed as if it were the data table.
page = requests.get(url, timeout=30)
page.raise_for_status()

# Parse the raw response body as HTML
soup = BeautifulSoup(page.content, "html.parser")

In [3]:
# Extract the table: header cells come from the last <thead> on the page and
# data cells from the last <tbody>.

all_theads = soup.find_all('thead')
all_tbodies = soup.find_all('tbody')
if not all_theads or not all_tbodies:
    # Same exception type the bare [-1] lookup would raise, but with a
    # message that explains what went wrong.
    raise IndexError('No <thead>/<tbody> element found in the downloaded page')

html_thead = all_theads[-1]
head = html_thead.find_all('tr')

html_tbody = all_tbodies[-1]
text = html_tbody.find_all('tr')

# One list of header-cell strings per header row
headings = [[cell.text for cell in tr.find_all(['th'])] for tr in head]

# One list of data-cell strings per body row
content = [[cell.text for cell in tr.find_all(['td'])] for tr in text]


In [4]:
# Inspect the scraped body rows (each row is a list of cell-text strings)
content

[['1', 'Andaman and Nicobar Islands', '55', '152', '0', '207'],
 ['2', 'Andhra Pradesh', '28800', '24228', '696', '53724'],
 ['3', 'Arunachal Pradesh', '502', '285', '3', '790'],
 ['4', 'Assam', '8229', '17095', '58', '25382'],
 ['5', 'Bihar', '9996', '17433', '217', '27646'],
 ['6', 'Chandigarh', '207', '518', '12', '737'],
 ['7', 'Chhattisgarh', '1592', '3944', '25', '5561'],
 ['8', 'Dadra and Nagar Haveli and Daman and Diu', '234', '448', '2', '684'],
 ['9', 'Delhi', '15166', '104918', '3663', '123747'],
 ['10', 'Goa', '1469', '2361', '23', '3853'],
 ['11', 'Gujarat', '11513', '35678', '2162', '49353'],
 ['12', 'Haryana', '6277', '20226', '355', '26858'],
 ['13', 'Himachal Pradesh', '553', '1067', '11', '1631'],
 ['14', 'Jammu and Kashmir', '6122', '8274', '254', '14650'],
 ['15', 'Jharkhand', '2893', '2810', '53', '5756'],
 ['16', 'Karnataka', '42222', '23795', '1403', '67420'],
 ['17', 'Kerala', '7615', '5616', '43', '13274'],
 ['18', 'Ladakh', '186', '1007', '2', '1195'],
 ['19',

In [5]:
# Build the data frame from the scraped rows

# The last five rows contain unwanted information, so they are left out
state_rows = content[:-5]
column_names = headings[0]

# Assemble the frame and discard the 'S. No.' column in a single expression
df = pd.DataFrame(state_rows, columns=column_names).drop('S. No.', axis=1)

## Add Date, Latitude and Longitude

In [6]:
# Stamp every row with today's date in a new 'Date' column
now = datetime.now()
date_format = "%m/%d/%Y"

# Write the date as text first, then parse the column into datetimes
df['Date'] = now.strftime(date_format)
df['Date'] = pd.to_datetime(df['Date'], format=date_format)
df.head()

Unnamed: 0,Name of State / UT,Active Cases*,Cured/Discharged/Migrated*,Deaths**,Total Confirmed cases*,Date
0,Andaman and Nicobar Islands,55,152,0,207,2020-07-21
1,Andhra Pradesh,28800,24228,696,53724,2020-07-21
2,Arunachal Pradesh,502,285,3,790,2020-07-21
3,Assam,8229,17095,58,25382,2020-07-21
4,Bihar,9996,17433,217,27646,2020-07-21


In [7]:
# Add latitude and longitude columns by looking up each state name.
# Names missing from the dictionaries get NaN in both columns.
# 'Telangana' is listed next to the original 'Telengana' key so that either
# spelling resolves to the same coordinates.
lat = {'Delhi':28.7041, 'Haryana':29.0588, 'Kerala':10.8505, 'Rajasthan':27.0238,
       'Telengana':18.1124, 'Telangana':18.1124,
       'Uttar Pradesh':26.8467, 'Ladakh':34.2996, 'Tamil Nadu':11.1271,
       'Andhra Pradesh':15.9129, 'Odisha':20.9517, 'Uttarakhand':30.0668, 'West Bengal':22.9868,
       'Puducherry': 11.9416, 'Chandigarh': 30.7333, 'Chhattisgarh':21.2787,
       'Jammu and Kashmir':33.7782, 'Punjab':31.1471, 'Karnataka':15.3173,
       'Maharashtra':19.7515, 'Gujarat': 22.2587,
       'Himachal Pradesh': 31.1048, 'Madhya Pradesh': 22.9734, 'Bihar': 25.0961, 'Manipur':24.6637,
       'Mizoram':23.1645, 'Goa': 15.2993, 'Andaman and Nicobar Islands': 11.7401, 'Assam' : 26.2006,
       'Jharkhand': 23.6102, 'Arunachal Pradesh': 28.2180, 'Tripura': 23.9408, 'Nagaland': 26.1584,
       'Meghalaya' : 25.4670}

long = {'Delhi':77.1025, 'Haryana':76.0856, 'Kerala':76.2711, 'Rajasthan':74.2179,
        'Telengana':79.0193, 'Telangana':79.0193,
        'Uttar Pradesh':80.9462, 'Ladakh':78.2932, 'Tamil Nadu':78.6569,
        'Andhra Pradesh':79.7400, 'Odisha':85.0985, 'Uttarakhand':79.0193, 'West Bengal':87.8550,
        'Puducherry': 79.8083, 'Chandigarh': 76.7794, 'Chhattisgarh':81.8661,
        'Jammu and Kashmir':76.5762, 'Punjab':75.3412, 'Karnataka':75.7139,
        'Maharashtra':75.7139, 'Gujarat': 71.1924,
        'Himachal Pradesh': 77.1734, 'Madhya Pradesh': 78.6569, 'Bihar': 85.3131, 'Manipur':93.9063,
        'Mizoram':92.9376, 'Goa': 74.1240, 'Andaman and Nicobar Islands': 92.6586, 'Assam' : 92.9376,
        'Jharkhand': 85.2799, 'Arunachal Pradesh': 94.7278, 'Tripura': 91.9882, 'Nagaland': 94.5624,
        'Meghalaya' : 91.3662}

# The '#' characters are only removed from the state names in the later
# cleaning step, so strip them for the lookup here; otherwise a name carrying
# a '#' would miss its dictionary key and end up with NaN coordinates.
# The stored column itself is left unchanged.
state_lookup = df['Name of State / UT'].str.replace('#', '', regex=False)

df['Latitude'] = state_lookup.map(lat)
df['Longitude'] = state_lookup.map(long)
df.head()

Unnamed: 0,Name of State / UT,Active Cases*,Cured/Discharged/Migrated*,Deaths**,Total Confirmed cases*,Date,Latitude,Longitude
0,Andaman and Nicobar Islands,55,152,0,207,2020-07-21,11.7401,92.6586
1,Andhra Pradesh,28800,24228,696,53724,2020-07-21,15.9129,79.74
2,Arunachal Pradesh,502,285,3,790,2020-07-21,28.218,94.7278
3,Assam,8229,17095,58,25382,2020-07-21,26.2006,92.9376
4,Bihar,9996,17433,217,27646,2020-07-21,25.0961,85.3131


## Data Cleaning

In [8]:
# Normalise the column names.
# The renames run in the order written. DataFrame.rename ignores keys that are
# not present, so each alternative header wording below is handled only when
# it actually occurs. The regex patterns are raw strings so that '\(' reaches
# the regex engine as an escaped parenthesis without an invalid-escape warning.
df = (
    df
    # Name of State / UT -> States
    .rename(columns={'Name of State / UT': 'States'})
    # Active Cases* -> Active Cases
    .rename(columns={'Active Cases*': 'Active Cases'})
    # Deaths** -> Death
    .rename(columns={'Deaths**': 'Death'})
    # Long-form deaths header -> Deaths -> Death
    .rename(columns=lambda x: re.sub(r'Deaths \( more than 70% cases due to comorbidities \)',
                                     'Deaths', x))
    .rename(columns={'Deaths': 'Death'})
    # Either wording of the total-confirmed header -> Total Cases
    .rename(columns=lambda x: re.sub(r'Total Confirmed cases \(Including .* foreign Nationals\) ',
                                     'Total Cases', x))
    .rename(columns={'Total Confirmed cases*': 'Total Cases'})
    # Cured/Discharged/Migrated* -> Recovered
    .rename(columns={'Cured/Discharged/Migrated*': 'Recovered'})
)

# Remove '#' characters from the state names (literal match, not a regex)
df['States'] = df['States'].str.replace('#', '', regex=False)

# Remove '#' characters from the death counts
df['Death'] = df['Death'].str.replace('#', '', regex=False)

# Drop the scraped active-cases column; the 'Active' figure is derived from
# total, recovered and death counts once the daily files are merged.
df = df.drop('Active Cases', axis=1)

## Save Data

In [9]:
# Hidden folder that collects the daily CSV files
my_folder = './.daily_update/'

# exist_ok=True creates the folder only when it is missing, without a
# separate check-then-create step that could race with another process.
os.makedirs(my_folder, exist_ok=True)

# The daily file is named year_month_day.csv
file_name = my_folder + now.strftime("%Y_%m_%d") + '.csv'

# Write the data frame to the CSV file without the index column
df.to_csv(file_name, index=False)

## Merge all CSV files

In [10]:
# Collect the path of every daily-cases CSV file in the folder
csv_files = glob.glob(my_folder + '*.csv')

# Load each file into its own data frame, in the order glob returned them
all_data = [pd.read_csv(csv_path) for csv_path in csv_files]

In [11]:
# Stack all daily data frames into one
data = pd.concat(all_data, ignore_index=True)

# Parse the dates before ordering anything, so rows are sorted as dates and
# not as raw strings read back from the CSV files.
data['Date'] = pd.to_datetime(data['Date'])

# Active cases = total cases - recovered - deaths
data['Active'] = data['Total Cases'] - data['Recovered'] - data['Death']

# Single sort by date, then state name, with a fresh row index
data = data.sort_values(['Date', 'States']).reset_index(drop=True)

In [12]:
# Write the merged data frame to a single CSV file, without the index column
output_csv = 'COVID-19-INDIA.csv'
data.to_csv(output_csv, index=False)