This notebook derives weather data for different states of India. The source of the data is https://www1.ncdc.noaa.gov/

In [1]:
import pandas as pd

In [2]:
# Two-letter state key -> id of the station whose readings are used for that
# state (the id is matched against the 'station' column further down).
state_to_code_map = dict(
    TN='IN020040900',
    MH='IN012070800',
    GJ='IN005010600',
    DL='IN022023000',
    RJ='IN019131301',
    KA='IN009010100',
)

In [3]:
# Column names are supplied explicitly, so pandas does not take them from the
# first line of the file.
df = pd.read_csv(
    'data/2020.csv',
    names=[
        'station', 'id', 'param',
        'value1', 'value2', 'value3', 'value4', 'value5',
    ],
)

In [4]:
# Keep only the rows whose station id starts with "IN" (the India stations),
# then list the distinct station ids that remain.
india_df = df[df['station'].str.contains('^IN')]
uniq_stations = india_df['station'].unique().tolist()

In [5]:
def station_info(path='data/india_stations.txt'):
    """Parse the station listing file into two lookup dictionaries.

    Every non-blank line is split on whitespace and read positionally:
    field 0 is the station id, fields 1-3 are stored as 'lat', 'lon' and
    'msl', field 4 as 'name1', and field 5 (when present) as 'name2'.
    All values are kept as the strings found in the file; any fields after
    the sixth are ignored.

    Parameters
    ----------
    path : str, optional
        Location of the station listing. Defaults to the file this notebook
        has always read, so existing ``station_info()`` calls are unaffected.

    Returns
    -------
    tuple (stations, name_station)
        ``stations`` maps station id -> dict with keys 'lat', 'lon', 'msl',
        'name1', 'name2' ('name2' is '' when the line has no sixth field).
        ``name_station`` maps 'name1' -> station id; when several stations
        share the same first name token, the last one in the file wins.

    Raises
    ------
    IndexError
        If a non-blank line has fewer than five whitespace-separated fields.
    OSError
        If the file cannot be opened for reading.
    """
    stations = {}
    name_station = {}
    # Plain read mode: nothing is written, so write permission ('r+') is not
    # required, and the context manager closes the file even on error.
    with open(path, 'r') as station_file:
        for line in station_file:
            fields = line.split()
            if not fields:
                # Blank lines (including the empty remainder after a trailing
                # newline) carry no station and used to crash on fields[0].
                continue
            station_id = fields[0]
            name1 = fields[4]
            stations[station_id] = {
                'lat': fields[1],
                'lon': fields[2],
                'msl': fields[3],
                'name1': name1,
                'name2': fields[5] if len(fields) > 5 else '',
            }
            name_station[name1] = station_id
    return (stations, name_station)
    

In [6]:
# Read station info data
(stations,name_station) = station_info()

In [7]:
# Per-date records for every processed state accumulate in this list.
all_data = []


def state_data(state_code):
    """Append one record per 'id' value for the station of ``state_code``.

    The station id is looked up in ``state_to_code_map`` and printed. The
    rows of ``india_df`` for that station are then folded into one dict per
    distinct 'id' value: the dict carries 'date' (the 'id' value itself),
    'state_code', and one entry per 'param' holding that row's 'value1'
    (a later row with the same 'id' and 'param' overwrites an earlier one).
    The dicts are appended to the module-level ``all_data`` list in order of
    first appearance; the function returns nothing.
    """
    station_id = state_to_code_map[state_code]
    print(station_id)
    station_rows = india_df[india_df['station'] == station_id]

    records_by_date = {}
    for _, row in station_rows.iterrows():
        date_key = row['id']
        # Start a record the first time a date is seen, reuse it afterwards.
        record = records_by_date.setdefault(
            date_key, {'date': date_key, 'state_code': state_code}
        )
        record[row['param']] = row['value1']

    all_data.extend(records_by_date.values())
    

In [8]:
# Collect the records of every mapped state, in the mapping's own order.
for state_code in state_to_code_map.keys():
    state_data(state_code)

IN020040900
IN012070800
IN005010600
IN022023000
IN019131301
IN009010100


In [9]:
# Turn the accumulated per-date records into a single frame.
all_df = pd.DataFrame(all_data)

# 'date' holds YYYYMMDD values; convert them to real timestamps.
all_df['date'] = pd.to_datetime(all_df['date'], format='%Y%m%d')

# Rows without a PRCP entry get 0. The result is assigned back to the column
# rather than calling fillna(inplace=True) on `all_df.PRCP`: that form mutates
# a temporary Series and leaves `all_df` unchanged when pandas Copy-on-Write
# is active.
all_df['PRCP'] = all_df['PRCP'].fillna(0)


In [11]:
# Show the combined frame as this cell's output.
all_df

Unnamed: 0,date,state_code,TMAX,TMIN,PRCP,TAVG
0,2020-01-01,TN,303.0,225.0,279.0,257
1,2020-01-02,TN,,234.0,330.0,276
2,2020-01-03,TN,312.0,240.0,0.0,279
3,2020-01-04,TN,312.0,245.0,0.0,280
4,2020-01-05,TN,316.0,,0.0,277
...,...,...,...,...,...,...
795,2020-05-09,KA,,238.0,0.0,291
796,2020-05-10,KA,352.0,240.0,0.0,290
797,2020-05-11,KA,353.0,236.0,0.0,283
798,2020-05-12,KA,340.0,234.0,0.0,276


In [12]:
# Write the combined frame to weather.csv, leaving out the row index.
all_df.to_csv(
    'weather.csv',
    index=False,
)