In [1]:
import pandas as pd
import numpy as np
import math

def file_reader(file_location):
    """Load a delimited text file into a DataFrame, picking the parser by suffix.

    Parameters
    ----------
    file_location : str
        Path of the file to read. A path ending in 'csv' is parsed as
        comma-separated with its first column used as the row index. A path
        ending in 'tsv' is parsed as tab-separated and keeps the default
        integer row index.

    Returns
    -------
    pandas.DataFrame
        The parsed table.

    Raises
    ------
    ValueError
        If the path ends in neither 'csv' nor 'tsv'. Previously this case
        fell through and returned None, which only surfaced later as an
        unrelated AttributeError in the caller.
    """
    if file_location.endswith('csv'):
        return pd.read_csv(file_location, engine='python', index_col=0)
    if file_location.endswith('tsv'):
        return pd.read_csv(file_location, engine='python', sep='\t')
    raise ValueError(
        "Unsupported file type for %r: expected a path ending in 'csv' or 'tsv'"
        % (file_location,)
    )

def df_reader(file_location):
    """Read an indicator table, strip its metadata columns and transpose it.

    The file is loaded through ``file_reader``. The columns
    'Indicator Name', 'Indicator Code' and 'Country Code' are removed, as is
    any column whose name starts with 'Unnamed'. The remaining table is
    returned transposed, so the original row index becomes the column labels
    and the original column labels become the row index.

    Parameters
    ----------
    file_location : str
        Path handed straight to ``file_reader``.

    Returns
    -------
    pandas.DataFrame
        The transposed, trimmed table.
    """
    raw_table = file_reader(file_location)
    metadata_columns = ['Indicator Name','Indicator Code','Country Code']
    trimmed = raw_table.drop(columns=metadata_columns)
    # pandas names header-less columns 'Unnamed: N'; keep everything else.
    is_named = ~trimmed.columns.str.contains('^Unnamed')
    return trimmed.loc[:, is_named].T

In [2]:
def year_float2str(year_quarter):
    """Convert a fractional-year number into a 'YYYYQn' quarter label.

    The encoding places each quarter at its *end*: ``Y + 0.25`` is 'YQ1',
    ``Y + 0.5`` is 'YQ2', ``Y + 0.75`` is 'YQ3', and a whole number ``Y``
    is Q4 of the previous year, i.e. '(Y-1)Q4'.

    The fractional part is matched to the nearest quarter within a small
    tolerance instead of by exact float equality, so values carrying
    floating-point rounding error are still labelled correctly.

    Parameters
    ----------
    year_quarter : float
        Year plus quarter fraction, e.g. ``2000.25``.

    Returns
    -------
    str
        The quarter label, e.g. '2000Q1'.

    Raises
    ------
    ValueError
        If the value does not lie on a quarter boundary (previously such
        values silently produced None).
    """
    year = int(year_quarter)
    # Fractional part expressed in quarter units: 0, 1, 2, 3 (or ~4 when a
    # whole number was stored slightly below its true value).
    quarter_units = (year_quarter - year) * 4
    quarter = int(round(quarter_units))
    on_boundary = math.isclose(quarter_units, quarter, abs_tol=1e-6)
    if quarter not in (0, 1, 2, 3, 4) or not on_boundary:
        raise ValueError(
            "%r is not on a quarter boundary (.0, .25, .5 or .75)" % (year_quarter,)
        )
    if quarter == 0:
        # A whole number marks the end of the previous year's fourth quarter.
        return str(year - 1) + "Q4"
    return str(year) + "Q" + str(quarter)
    
def year_str2float(year_quarter):
    """Convert a 'YYYYQn' quarter label into a fractional-year number.

    Each quarter maps to the point in time at which it ends: 'YQ1' gives
    ``Y + 0.25``, 'YQ2' gives ``Y + 0.5``, 'YQ3' gives ``Y + 0.75`` and
    'YQ4' gives ``Y + 1.0``.

    Parameters
    ----------
    year_quarter : str
        Label whose last two characters are 'Q1'..'Q4' and whose leading
        characters are an integer year, e.g. '2020Q2'.

    Returns
    -------
    float
        The fractional-year value, e.g. ``2020.5``.

    Raises
    ------
    ValueError
        If the label does not end in 'Q1'..'Q4' (previously such labels
        silently produced None), or if the year part is not an integer.
    """
    quarter_offsets = {'Q1': 0.25, 'Q2': 0.50, 'Q3': 0.75, 'Q4': 1.00}
    for suffix, offset in quarter_offsets.items():
        if year_quarter.endswith(suffix):
            return int(year_quarter[:-2]) + offset
    raise ValueError(
        "%r is not a quarter label of the form 'YYYYQ1'..'YYYYQ4'" % (year_quarter,)
    )

In [3]:
# Common output grid: 83 evenly spaced fractional-year values from 2000.25 to
# 2020.75 inclusive (a step of 0.25), plus the matching quarter label for
# each value as produced by year_float2str.
xf = np.linspace(start=2000.25, stop=2020.75, num=83)
x_index = list(map(year_float2str, xf))

def fill_missing_values(dflist, x_new=None):
    """Linearly interpolate a series with gaps onto a grid of x values.

    The non-missing entries of ``dflist`` are used as known points, with
    each index label converted by ``float()`` to give the x coordinate.
    A degree-1 (piecewise linear) interpolating spline is fitted through
    them and evaluated on the target grid.

    Parameters
    ----------
    dflist : pandas.Series
        Values to interpolate. Index labels must be convertible with
        ``float()``; missing values are skipped.
    x_new : array-like, optional
        x values at which to evaluate the interpolant. Defaults to the
        module-level grid ``xf`` (the behaviour before this parameter
        existed).

    Returns
    -------
    numpy.ndarray
        Interpolated values, one per entry of the target grid.

    Raises
    ------
    ValueError
        If fewer than two non-missing points are available, since a line
        cannot be fitted through fewer.
    """
    from scipy.interpolate import InterpolatedUnivariateSpline

    if x_new is None:
        x_new = xf

    observed = dflist.dropna()
    xt = np.asarray([float(label) for label in observed.index], dtype=float)
    yt = observed.to_numpy(dtype=float)
    if xt.size < 2:
        raise ValueError(
            "need at least 2 non-missing values to interpolate, got %d" % xt.size
        )

    # The spline requires increasing x; do not rely on the input row order.
    order = np.argsort(xt, kind='stable')
    spline = InterpolatedUnivariateSpline(xt[order], yt[order], k=1)
    return spline(x_new)

def preprocess_dataset(df,x):
    """Interpolate every column of ``df`` onto the shared quarterly grid.

    Each column other than 'Year/Quarter' is passed through
    ``fill_missing_values`` and the results are assembled into a new
    DataFrame whose columns are in sorted order and whose index is the
    module-level ``x_index`` labels, named 'Year/Quarter'.

    The frame is built in a single constructor call rather than by
    inserting one column at a time, which avoids the DataFrame
    fragmentation (and PerformanceWarning) pandas reports for wide tables.

    Parameters
    ----------
    df : pandas.DataFrame
        One column per series to interpolate; the row index must be usable
        by ``fill_missing_values``.
    x : object
        Unused. Retained so existing two-argument calls keep working.

    Returns
    -------
    pandas.DataFrame
        Interpolated values indexed by 'Year/Quarter'.
    """
    countries = sorted(set(df.keys()) - {'Year/Quarter'})
    filled_columns = {
        country: fill_missing_values(df[country]) for country in countries
    }
    quarter_index = pd.Index(x_index, name='Year/Quarter')
    return pd.DataFrame(filled_columns, index=quarter_index)

In [4]:
# File names of the input datasets, listed in the order they are processed.
strlist = [
    'above65.csv', 'area.csv', 'literacy_rate.csv', 'mobile_users.csv',
    'total_pop.csv', 'unemployed.csv', 'GDP_Change.csv',
]

In [5]:
# Process every dataset named in strlist: read it from
# 'dataset/Downloaded Dataset/', interpolate each column onto the shared
# quarterly grid, and write the result to 'dataset/' under the same file name.
for loc in strlist:
    print(loc)
    
    if(loc == 'GDP_Change.csv'):
        # This file is read directly with file_reader (no metadata columns are
        # dropped and no transpose is applied); its row labels are quarter
        # strings such as '2020Q2'.
        df = file_reader('dataset/Downloaded Dataset/'+loc)
        # Discard every column that has no value for the '2020Q2' row.
        droplist = []
        for country in df.keys():
            if( np.isnan( df[country]['2020Q2'] ) ):
                droplist.append(country)
        df       = df.drop(droplist, axis = 1)
        # Re-index by fractional year so fill_missing_values can apply
        # float() to the row labels.
        index    = [year_str2float(val) for val in df.index]
        df['index'] = index
        df       = df.set_index('index')
        # The second argument is ignored by preprocess_dataset.
        outdf    = preprocess_dataset(df,index)
        # Drop the last grid row (2020Q3, since the grid ends at 2020.75).
        # NOTE(review): presumably because the GDP data stops at 2020Q2 - confirm.
        outdf    = outdf[:-1]
    
    else:
        df       = df_reader('dataset/Downloaded Dataset/'+loc)
        # Keep only columns that have at least 5 non-missing values.
        df       = df.dropna(axis=1, thresh=5)
        # The second argument is ignored by preprocess_dataset.
        outdf    = preprocess_dataset(df,df.index)
       

    if( loc == 'unemployed.csv'):
        # Set the 'India' column from a separate file, interpolated onto the
        # same grid. NOTE(review): assumes that file's row index holds
        # numeric year labels, as fill_missing_values requires - confirm.
        df2       = file_reader('dataset/Downloaded Dataset/india-unemployment-rate.csv')
        filled_list = fill_missing_values(df2['India'])
        outdf['India'] = filled_list
        
    outdf.to_csv('dataset/'+loc)

above65.csv
area.csv
literacy_rate.csv
mobile_users.csv
total_pop.csv
unemployed.csv
GDP_Change.csv
