# Preprocessing of the data
Pre-processing of the data involves the following steps:
1. Cleaning up the dataset - checking for and removing NaN or NA values
2. Fixing the date columns of the files from "xx-yy-mm-dd" format to python dateTime format
3. Remove unnecessary columns
4. Concatenate dataframes based on need
5. Export preprocessed dataframe as csv (??)

## Call the Libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.pyplot import figure
from datetime import datetime
from sklearn.model_selection import RandomizedSearchCV, train_test_split
from sklearn.svm import SVR
from sklearn.metrics import mean_absolute_error, mean_squared_error

## Import the [data](https://drive.google.com/open?id=1G_StENIk74Ym4dGJ2cGy1uIzhHvbS10W)
5 cities - City_A, City_B, City_C, City_D, City_E <br>
6 files per city - density, grid_attr, infection, migration, transfer, weather

In [2]:
# City A
density_A = pd.read_csv("/Users/vineethjason/Desktop/chinaCompetitionAI/train_data/city_A/density.csv")
grid_attr_A = pd.read_csv("/Users/vineethjason/Desktop/chinaCompetitionAI/train_data/city_A/grid_attr.csv")
infection_A = pd.read_csv("/Users/vineethjason/Desktop/chinaCompetitionAI/train_data/city_A/infection.csv")
migration_A = pd.read_csv("/Users/vineethjason/Desktop/chinaCompetitionAI/train_data/city_A/migration.csv")
transfer_A = pd.read_csv("/Users/vineethjason/Desktop/chinaCompetitionAI/train_data/city_A/transfer.csv")
weather_A = pd.read_csv("/Users/vineethjason/Desktop/chinaCompetitionAI/train_data/city_A/weather.csv")

In [3]:
# City B
density_B = pd.read_csv("/Users/vineethjason/Desktop/chinaCompetitionAI/train_data/city_B/density.csv")
grid_attr_B = pd.read_csv("/Users/vineethjason/Desktop/chinaCompetitionAI/train_data/city_B/grid_attr.csv")
infection_B = pd.read_csv("/Users/vineethjason/Desktop/chinaCompetitionAI/train_data/city_B/infection.csv")
migration_B = pd.read_csv("/Users/vineethjason/Desktop/chinaCompetitionAI/train_data/city_B/migration.csv")
transfer_B = pd.read_csv("/Users/vineethjason/Desktop/chinaCompetitionAI/train_data/city_B/transfer.csv")
weather_B = pd.read_csv("/Users/vineethjason/Desktop/chinaCompetitionAI/train_data/city_B/weather.csv")

In [4]:
# City C
density_C = pd.read_csv("/Users/vineethjason/Desktop/chinaCompetitionAI/train_data/city_C/density.csv")
grid_attr_C = pd.read_csv("/Users/vineethjason/Desktop/chinaCompetitionAI/train_data/city_C/grid_attr.csv")
infection_C = pd.read_csv("/Users/vineethjason/Desktop/chinaCompetitionAI/train_data/city_C/infection.csv")
migration_C = pd.read_csv("/Users/vineethjason/Desktop/chinaCompetitionAI/train_data/city_C/migration.csv")
transfer_C = pd.read_csv("/Users/vineethjason/Desktop/chinaCompetitionAI/train_data/city_C/transfer.csv")
weather_C = pd.read_csv("/Users/vineethjason/Desktop/chinaCompetitionAI/train_data/city_C/weather.csv")

In [5]:
# City D
density_D = pd.read_csv("/Users/vineethjason/Desktop/chinaCompetitionAI/train_data/city_D/density.csv")
grid_attr_D = pd.read_csv("/Users/vineethjason/Desktop/chinaCompetitionAI/train_data/city_D/grid_attr.csv")
infection_D = pd.read_csv("/Users/vineethjason/Desktop/chinaCompetitionAI/train_data/city_D/infection.csv")
migration_D = pd.read_csv("/Users/vineethjason/Desktop/chinaCompetitionAI/train_data/city_D/migration.csv")
transfer_D = pd.read_csv("/Users/vineethjason/Desktop/chinaCompetitionAI/train_data/city_D/transfer.csv")
weather_D = pd.read_csv("/Users/vineethjason/Desktop/chinaCompetitionAI/train_data/city_D/weather.csv")

In [6]:
# City E
density_E = pd.read_csv("/Users/vineethjason/Desktop/chinaCompetitionAI/train_data/city_E/density.csv")
grid_attr_E = pd.read_csv("/Users/vineethjason/Desktop/chinaCompetitionAI/train_data/city_E/grid_attr.csv")
infection_E = pd.read_csv("/Users/vineethjason/Desktop/chinaCompetitionAI/train_data/city_E/infection.csv")
migration_E = pd.read_csv("/Users/vineethjason/Desktop/chinaCompetitionAI/train_data/city_E/migration.csv")
transfer_E = pd.read_csv("/Users/vineethjason/Desktop/chinaCompetitionAI/train_data/city_E/transfer.csv")
weather_E = pd.read_csv("/Users/vineethjason/Desktop/chinaCompetitionAI/train_data/city_E/weather.csv")

## Add Column Names
Original files did not have column names - column names added from [here](https://aistudio.baidu.com/aistudio/competition/detail/36?alertTip=The%20invitation%20code%20has%20been%20used%2C%20please%20get%20a%20new%20one.&lang=en&ticket=4976003d8f3746f4ad5728da43650b90)

In [7]:
# column names:
density_columns = ["date","hour","longitude","latitude","population_flow_index"]
grid_attr_columns = ["grid_x","grid_y","region_id"]
infection_columns = ["city_id","region_id","date","number_of_newly_infected_persons"]
migration_columns = ["migration_date","migration_departure_city","migration_arrival_city","migration_scale_index"]
transfer_columns = ["hour","longitude_of_departure","latitude_departure","longitude_arrival","latitude_arrival","transfer_intensity"]
weather_columns = ["date","hour","temperature","humidity","wind_direction","wind_speed","wind_force","weather"]

# City A
density_A.columns = density_columns
grid_attr_A.columns = grid_attr_columns
infection_A.columns = infection_columns
migration_A.columns = migration_columns
transfer_A.columns = transfer_columns
weather_A.columns = weather_columns

# City B
density_B.columns = density_columns
grid_attr_B.columns = grid_attr_columns
infection_B.columns = infection_columns
migration_B.columns = migration_columns
transfer_B.columns = transfer_columns
weather_B.columns = weather_columns

# City C
density_C.columns = density_columns
grid_attr_C.columns = grid_attr_columns
infection_C.columns = infection_columns
migration_C.columns = migration_columns
transfer_C.columns = transfer_columns
weather_C.columns = weather_columns

# City D
density_D.columns = density_columns
grid_attr_D.columns = grid_attr_columns
infection_D.columns = infection_columns
migration_D.columns = migration_columns
transfer_D.columns = transfer_columns
weather_D.columns = weather_columns

# City E
density_E.columns = density_columns
grid_attr_E.columns = grid_attr_columns
infection_E.columns = infection_columns
migration_E.columns = migration_columns
transfer_E.columns = transfer_columns
weather_E.columns = weather_columns

## Checking Dimensions of data
Check the dimensions of the data to see how each dataframe is distributed

In [8]:
# City A
print(density_A.shape)
print(grid_attr_A.shape)
print(infection_A.shape)
print(migration_A.shape)
print(transfer_A.shape)
print(weather_A.shape)

# City B
print(density_B.shape)
print(grid_attr_B.shape)
print(infection_B.shape)
print(migration_B.shape)
print(transfer_B.shape)
print(weather_B.shape)

# City C
print(density_C.shape)
print(grid_attr_C.shape)
print(infection_C.shape)
print(migration_C.shape)
print(transfer_C.shape)
print(weather_C.shape)

# City D
print(density_D.shape)
print(grid_attr_D.shape)
print(infection_D.shape)
print(migration_D.shape)
print(transfer_D.shape)
print(weather_D.shape)

# City E
print(density_E.shape)
print(grid_attr_E.shape)
print(infection_E.shape)
print(migration_E.shape)
print(transfer_E.shape)
print(weather_E.shape)

(35906204, 5)
(11799, 3)
(5309, 4)
(359, 4)
(5670547, 6)
(1079, 8)
(20518124, 5)
(2991, 3)
(1349, 4)
(359, 4)
(1643953, 6)
(1079, 8)
(23761672, 5)
(13499, 3)
(6074, 4)
(359, 4)
(3808928, 6)
(1079, 8)
(25368550, 5)
(7499, 3)
(3374, 4)
(359, 4)
(3258700, 6)
(1079, 8)
(11019371, 5)
(3399, 3)
(1529, 4)
(359, 4)
(776609, 6)
(1079, 8)


## Convert 'date' column to true dateTime format
convert the date column values from 'xx-yy-mm-dd' format to python dateTime format using the 'datetime' package
Datasets with date column:
1. density.csv
2. infection.csv
3. migration.csv
4. weather.csv

Removes the original 'date' column and replaces it with 'dateTimeTrue' column where applicable ('migration_date' to 'migrationDateTimeTrue')

### City A

In [9]:
# Density
density_A.date = density_A.date.apply(str)
density_A["newDate"] = density_A.date.str[2:]
dateList_A = density_A["newDate"].tolist()
newDateList_A = []

for d in dateList_A:
    dates_A = datetime.strptime(d, '%y%m%d')
    newDateList_A.append(dates_A)

density_A["dateTimeTrue"] = newDateList_A
density_A = density_A.loc[:, ["dateTimeTrue","hour","longitude","latitude","population_flow_index"]]

In [11]:
# Infection
infection_A.date = infection_A.date.apply(str)
infection_A["newDate"] = infection_A.date.str[2:]
dateList_A = infection_A["newDate"].tolist()
newDateList_A = []

for d in dateList_A:
    dates_A = datetime.strptime(d, '%y%m%d')
    newDateList_A.append(dates_A)

infection_A["dateTimeTrue"] = newDateList_A
infection_A = infection_A.loc[:, ["city_id","region_id","dateTimeTrue","number_of_newly_infected_persons"]]

In [14]:
# Migration
migration_A.migration_date = migration_A.migration_date.apply(str)
migration_A["newMigrationDate"] = migration_A.migration_date.str[2:]
dateList_A = migration_A["newMigrationDate"].tolist()
newDateList_A = []

for d in dateList_A:
    dates_A = datetime.strptime(d, '%y%m%d')
    newDateList_A.append(dates_A)

migration_A["migrationDateTimeTrue"] = newDateList_A
migration_A = migration_A.loc[:, ["migrationDateTimeTrue","migration_departure_city","migration_arrival_city","migration_scale_index"]]

In [15]:
# Weather
weather_A.date = weather_A.date.apply(str)
weather_A["newDate"] = weather_A.date.str[2:]
dateList_A = weather_A["newDate"].tolist()
newDateList_A = []

for d in dateList_A:
    dates_A = datetime.strptime(d, '%y%m%d')
    newDateList_A.append(dates_A)

weather_A["dateTimeTrue"] = newDateList_A
weather_A = weather_A.loc[:, ["dateTimeTrue","hour","temperature","humidity","wind_direction","wind_speed","wind_force","weather"]]

### City B

In [30]:
# Density
density_B.date = density_B.date.apply(str)
density_B["newDate"] = density_B.date.str[2:]
dateList_B = density_B["newDate"].tolist()
newDateList_B = []

for d in dateList_B:
    dates_B = datetime.strptime(d, '%y%m%d')
    newDateList_B.append(dates_B)

density_B["dateTimeTrue"] = newDateList_B
density_B = density_B.loc[:, ["dateTimeTrue","hour","longitude","latitude","population_flow_index"]]

In [18]:
# Infection
infection_B.date = infection_B.date.apply(str)
infection_B["newDate"] = infection_B.date.str[2:]
dateList_B = infection_B["newDate"].tolist()
newDateList_B = []

for d in dateList_B:
    dates_B = datetime.strptime(d, '%y%m%d')
    newDateList_B.append(dates_B)

infection_B["dateTimeTrue"] = newDateList_B
infection_B = infection_B.loc[:, ["city_id","region_id","dateTimeTrue","number_of_newly_infected_persons"]]

In [19]:
# Migration
migration_B.migration_date = migration_B.migration_date.apply(str)
migration_B["newMigrationDate"] = migration_B.migration_date.str[2:]
dateList_B = migration_B["newMigrationDate"].tolist()
newDateList_B = []

for d in dateList_B:
    dates_B = datetime.strptime(d, '%y%m%d')
    newDateList_B.append(dates_B)

migration_B["migrationDateTimeTrue"] = newDateList_B
migration_B = migration_B.loc[:, ["migrationDateTimeTrue","migration_departure_city","migration_arrival_city","migration_scale_index"]]

In [20]:
# Weather
weather_B.date = weather_B.date.apply(str)
weather_B["newDate"] = weather_B.date.str[2:]
dateList_B = weather_B["newDate"].tolist()
newDateList_B = []

for d in dateList_B:
    dates_B = datetime.strptime(d, '%y%m%d')
    newDateList_B.append(dates_B)

weather_B["dateTimeTrue"] = newDateList_B
weather_B = weather_B.loc[:, ["dateTimeTrue","hour","temperature","humidity","wind_direction","wind_speed","wind_force","weather"]]

### City C

In [31]:
# Density
density_C.date = density_C.date.apply(str)
density_C["newDate"] = density_C.date.str[2:]
dateList_C = density_C["newDate"].tolist()
newDateList_C = []

for d in dateList_C:
    dates_C = datetime.strptime(d, '%y%m%d')
    newDateList_C.append(dates_C)

density_C["dateTimeTrue"] = newDateList_C
density_C = density_C.loc[:, ["dateTimeTrue","hour","longitude","latitude","population_flow_index"]]

In [21]:
# Infection
infection_C.date = infection_C.date.apply(str)
infection_C["newDate"] = infection_C.date.str[2:]
dateList_C = infection_C["newDate"].tolist()
newDateList_C = []

for d in dateList_C:
    dates_C = datetime.strptime(d, '%y%m%d')
    newDateList_C.append(dates_C)

infection_C["dateTimeTrue"] = newDateList_C
infection_C = infection_C.loc[:, ["city_id","region_id","dateTimeTrue","number_of_newly_infected_persons"]]

In [22]:
# Migration
migration_C.migration_date = migration_C.migration_date.apply(str)
migration_C["newMigrationDate"] = migration_C.migration_date.str[2:]
dateList_C = migration_C["newMigrationDate"].tolist()
newDateList_C = []

for d in dateList_C:
    dates_C = datetime.strptime(d, '%y%m%d')
    newDateList_C.append(dates_C)

migration_C["migrationDateTimeTrue"] = newDateList_C
migration_C = migration_C.loc[:, ["migrationDateTimeTrue","migration_departure_city","migration_arrival_city","migration_scale_index"]]

In [23]:
# Weather
weather_C.date = weather_C.date.apply(str)
weather_C["newDate"] = weather_C.date.str[2:]
dateList_C = weather_C["newDate"].tolist()
newDateList_C = []

for d in dateList_C:
    dates_C = datetime.strptime(d, '%y%m%d')
    newDateList_C.append(dates_C)

weather_C["dateTimeTrue"] = newDateList_C
weather_C = weather_C.loc[:, ["dateTimeTrue","hour","temperature","humidity","wind_direction","wind_speed","wind_force","weather"]]

### City D

In [32]:
# Density
density_D.date = density_D.date.apply(str)
density_D["newDate"] = density_D.date.str[2:]
dateList_D = density_D["newDate"].tolist()
newDateList_D = []

for d in dateList_D:
    dates_D = datetime.strptime(d, '%y%m%d')
    newDateList_D.append(dates_D)

density_D["dateTimeTrue"] = newDateList_D
density_D = density_D.loc[:, ["dateTimeTrue","hour","longitude","latitude","population_flow_index"]]

In [24]:
# Infection
infection_D.date = infection_D.date.apply(str)
infection_D["newDate"] = infection_D.date.str[2:]
dateList_D = infection_D["newDate"].tolist()
newDateList_D = []

for d in dateList_D:
    dates_D = datetime.strptime(d, '%y%m%d')
    newDateList_D.append(dates_D)

infection_D["dateTimeTrue"] = newDateList_D
infection_D = infection_D.loc[:, ["city_id","region_id","dateTimeTrue","number_of_newly_infected_persons"]]

In [25]:
# Migration
migration_D.migration_date = migration_D.migration_date.apply(str)
migration_D["newMigrationDate"] = migration_D.migration_date.str[2:]
dateList_D = migration_D["newMigrationDate"].tolist()
newDateList_D = []

for d in dateList_D:
    dates_D = datetime.strptime(d, '%y%m%d')
    newDateList_D.append(dates_D)

migration_D["migrationDateTimeTrue"] = newDateList_D
migration_D = migration_D.loc[:, ["migrationDateTimeTrue","migration_departure_city","migration_arrival_city","migration_scale_index"]]

In [26]:
# Weather
weather_D.date = weather_D.date.apply(str)
weather_D["newDate"] = weather_D.date.str[2:]
dateList_D = weather_D["newDate"].tolist()
newDateList_D = []

for d in dateList_D:
    dates_D = datetime.strptime(d, '%y%m%d')
    newDateList_D.append(dates_D)

weather_D["dateTimeTrue"] = newDateList_D
weather_D = weather_D.loc[:, ["dateTimeTrue","hour","temperature","humidity","wind_direction","wind_speed","wind_force","weather"]]

### City E

In [33]:
# Density
density_E.date = density_E.date.apply(str)
density_E["newDate"] = density_E.date.str[2:]
dateList_E = density_E["newDate"].tolist()
newDateList_E = []

for d in dateList_E:
    dates_E = datetime.strptime(d, '%y%m%d')
    newDateList_E.append(dates_E)

density_E["dateTimeTrue"] = newDateList_E
density_E = density_E.loc[:, ["dateTimeTrue","hour","longitude","latitude","population_flow_index"]]

In [27]:
# Infection
infection_E.date = infection_E.date.apply(str)
infection_E["newDate"] = infection_E.date.str[2:]
dateList_E = infection_E["newDate"].tolist()
newDateList_E = []

for d in dateList_E:
    dates_E = datetime.strptime(d, '%y%m%d')
    newDateList_E.append(dates_E)

infection_E["dateTimeTrue"] = newDateList_E
infection_E = infection_E.loc[:, ["city_id","region_id","dateTimeTrue","number_of_newly_infected_persons"]]

In [28]:
# Migration
migration_E.migration_date = migration_E.migration_date.apply(str)
migration_E["newMigrationDate"] = migration_E.migration_date.str[2:]
dateList_E = migration_E["newMigrationDate"].tolist()
newDateList_E = []

for d in dateList_E:
    dates_E = datetime.strptime(d, '%y%m%d')
    newDateList_E.append(dates_E)

migration_E["migrationDateTimeTrue"] = newDateList_E
migration_E = migration_E.loc[:, ["migrationDateTimeTrue","migration_departure_city","migration_arrival_city","migration_scale_index"]]

In [29]:
# Weather
weather_E.date = weather_E.date.apply(str)
weather_E["newDate"] = weather_E.date.str[2:]
dateList_E = weather_E["newDate"].tolist()
newDateList_E = []

for d in dateList_E:
    dates_E = datetime.strptime(d, '%y%m%d')
    newDateList_E.append(dates_E)

weather_E["dateTimeTrue"] = newDateList_E
weather_E = weather_E.loc[:, ["dateTimeTrue","hour","temperature","humidity","wind_direction","wind_speed","wind_force","weather"]]

## Concatenate Dataframes
Dataframes are aggregated based on (??)