In [1]:
import pandas as pd
import glob
import numpy as np
import re
from tqdm.auto import tqdm
import pickle
from pandarallel import pandarallel
import re

pandarallel.initialize()

INFO: Pandarallel will run on 4 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


In [2]:
data_in = r"../../sih-raw-data/"
data_out_csv = "../data/csv/"
data_out_comp = "../data/comp/"

In [3]:
pre_monsoon='pem'
post_monsoon='pom'
# stations='st'

### GroundWater district monthly: gw-district-monthly

In [5]:
# ground-water: unit =m

In [6]:
dfs = pd.DataFrame(data=None)
i = 0
for f in tqdm(glob.glob(data_in + r"gw-district-monthly/*")):
    df = pd.read_csv(f, header=1)
    df["loc"] = df.apply(
        lambda x: x["Unnamed: 0"].lower() + "-" + x["Unnamed: 1"].lower(),
        axis=1,
    )
    df = df.iloc[1:, 2:]
    #     print(list(df.columns.values)[-2])
    cols = df.columns.tolist()
    cols = cols[-1:] + cols[:-1]
    df = df[cols]
    #     print(list(df.columns.values)[-1])
    if "Jan" in list(df.columns.values)[-1]:
        df = df.iloc[:, :-1]
    if i == 0:
        df = df.set_index("loc")
        dfs = df
        i = 1
    else:
        dfs = dfs.join(df.set_index("loc"), on="loc")
df = dfs

HBox(children=(FloatProgress(value=0.0, max=26.0), HTML(value='')))




In [5]:
df.columns = pd.to_datetime(df.columns)
df.columns = df.columns.strftime("%Y-%m")
df.columns = sorted(list(df.columns))
st_dis = list(df.index.str.split("-"))
df["state"] = df.apply(lambda x: x.name.split("-")[0], axis=1)
df["district"] = df.apply(lambda x: x.name.split("-")[1], axis=1)
df["location"] = df.index
df = df.set_index("district")
df = df.replace("-", np.NaN)

In [6]:
df.to_csv(data_out_csv + r"gw-district-monthly.csv")
df.to_parquet(
    data_out_comp + r"gw-district-monthly.parquet.gzip", compression="gzip"
)

### Groundwater state monthly: gw-state-monthly

In [7]:
dfs = pd.DataFrame(data=None)
i = 0
for f in tqdm(glob.glob(data_in + r"gw-state-monthly/*")):
    df = pd.read_csv(f, header=1)
    df["state"] = df.apply(lambda x: x["Unnamed: 0"].lower(), axis=1)
    df = df.iloc[1:, 1:]
    cols = df.columns.tolist()
    cols = cols[-1:] + cols[:-1]
    df = df[cols]
    if i == 0:
        df = df.set_index("state")
        dfs = df
        i = 1
    else:
        dfs = dfs.join(df.set_index("state"), on="state")
df = dfs

HBox(children=(FloatProgress(value=0.0, max=27.0), HTML(value='')))




In [8]:
df.columns = pd.to_datetime(df.columns)
df.columns = df.columns.strftime("%Y-%m")
df.columns = sorted(list(df.columns))
df = df.replace("-", np.NaN)

In [10]:
df.to_csv(data_out_csv + r"gw-state-monthly.csv")
df.to_parquet(
    data_out_comp + r"gw-state-monthly.parquet.gzip", compression="gzip"
)

### GW block pre,post,yearly: seasonal-fluctuation-state

In [4]:
df = pd.DataFrame(data=None)

In [5]:
dfs = pd.DataFrame(data=None)
i = 0
for f in tqdm(glob.glob(data_in + r"seasonal-fluctuation-state/*")):
    df = pd.read_excel(f, sheet_name=1, usecols="B:G", header=1)
    df["loc"] = df.apply(
        lambda x: x["State"].lower()
        + "-_-"
        + x["District"].lower()
        + "-_-"
        + x["Block"].lower(),
        axis=1,
    )
    df = (
        df.groupby("loc")
        .agg(
            {
                "Pre-Monsoon": ["mean"],
                "Post-Monsoon": ["mean"],
            }
        )
        .round(2)
    )
    df.columns = ["_".join(col) for col in df.columns.values]
    year = re.search("\d+", f.split("/")[-1]).group(0)
    df[year + "-pem"] = df["Pre-Monsoon_mean"]
    df[year + "-pom"] = df["Post-Monsoon_mean"]
    df = df.drop(
        ["Pre-Monsoon_mean", "Post-Monsoon_mean"], axis=1
    )
    if i == 0:
        dfs = df
    else:
        dfs = dfs.join(df, how="outer")
    i += 1

HBox(children=(FloatProgress(value=0.0, max=26.0), HTML(value='')))




In [6]:
dfs.head()

Unnamed: 0_level_0,1997-pem,1997-pom,2016-pem,2016-pom,2004-pem,2004-pom,2000-pem,2000-pom,2013-pem,2013-pom,...,1995-pem,1995-pom,2006-pem,2006-pom,2007-pem,2007-pom,2002-pem,2002-pom,2017-pem,2017-pom
loc,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
andaman & nicobar-_-north & middle andaman-_-baratang island,,,3.45,0.09,,,,,,,...,,,,,,,,,1.94,0.32
andaman & nicobar-_-north & middle andaman-_-middle andaman,,,2.61,1.02,,,,,0.99,0.58,...,,,,,,,,,2.38,0.79
andaman & nicobar-_-south andaman-_-havelock island,,,3.08,1.64,,,,,1.8,0.66,...,,,,,,,,,,
andaman & nicobar-_-south andaman-_-neil island,,,4.39,2.27,,,,,3.26,1.24,...,,,,,,,,,,
andaman & nicobar-_-south andaman-_-south andaman,,,2.56,1.18,,,,,1.43,1.03,...,,,,,,,,,2.16,0.86


In [7]:
dfs.shape

(5368, 52)

In [8]:
# df = pd.read_excel(
#     data_in + "annual-fluctuation-state-post-pre/( PRE MONSOON 2019-20.xls",
#     sheet_name=1,
#     usecols="B:G",
#     header=1,
# )
# df["loc"] = df.apply(
#     lambda x: x["State"].lower()
#     + "-_-"
#     + x["District"].lower()
#     + "-_-"
#     + x["Block"].lower(),
#     axis=1,
# )
# df = df.groupby("loc").agg({"2020": ["mean"] }).round(2)
# df.columns = ["_".join(col) for col in df.columns.values]
# df["2020" + "-pem"] = df["2020_mean"]
# # df["2020-st"]=df['Station_count']
# df = df.drop([("2020_mean")], axis=1)
# df.columns
# dfs=dfs.join(df, how="outer")

In [8]:
df = dfs
del dfs

In [9]:
df.columns = sorted(list(df.columns))

In [10]:
# df["2020-pom"]=np.NaN

In [11]:
df.isna().sum()

1994-pem    1196
1994-pom    1196
1995-pem    1044
1995-pom    1044
1996-pem    2027
1996-pom    2027
1997-pem    1425
1997-pom    1425
1998-pem     979
1998-pom     979
1999-pem    1415
1999-pom    1415
2000-pem    1242
2000-pom    1242
2001-pem    1597
2001-pom    1597
2002-pem    1369
2002-pom    1369
2003-pem    1322
2003-pom    1322
2004-pem    1198
2004-pom    1198
2005-pem    1024
2005-pom    1024
2006-pem    1505
2006-pom    1505
2007-pem    1791
2007-pom    1791
2008-pem    1251
2008-pom    1251
2009-pem    1192
2009-pom    1192
2010-pem    1112
2010-pom    1112
2011-pem    1345
2011-pom    1345
2012-pem    2650
2012-pom    2650
2013-pem     705
2013-pom     705
2014-pem    5060
2014-pom    5060
2015-pem    4896
2015-pom    4896
2016-pem    1118
2016-pom    1118
2017-pem    1020
2017-pom    1020
2018-pem    1510
2018-pom    1510
2019-pem     848
2019-pom     848
dtype: int64

In [12]:
df["state"] = df.apply(lambda row: row.name.split("-_-")[0], axis=1)
df["district"] = df.apply(lambda row: row.name.split("-_-")[1], axis=1)
df["block"] = df.apply(lambda row: row.name.split("-_-")[2], axis=1)
df['india']=df.apply(lambda row: 'india',axis=1)

In [13]:
df

Unnamed: 0_level_0,1994-pem,1994-pom,1995-pem,1995-pom,1996-pem,1996-pom,1997-pem,1997-pom,1998-pem,1998-pom,...,2017-pem,2017-pom,2018-pem,2018-pom,2019-pem,2019-pom,state,district,block,india
loc,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
andaman & nicobar-_-north & middle andaman-_-baratang island,,,3.45,0.09,,,,,,,...,,,,,1.94,0.32,andaman & nicobar,north & middle andaman,baratang island,india
andaman & nicobar-_-north & middle andaman-_-middle andaman,,,2.61,1.02,,,,,0.99,0.58,...,,,,,2.38,0.79,andaman & nicobar,north & middle andaman,middle andaman,india
andaman & nicobar-_-south andaman-_-havelock island,,,3.08,1.64,,,,,1.80,0.66,...,,,,,,,andaman & nicobar,south andaman,havelock island,india
andaman & nicobar-_-south andaman-_-neil island,,,4.39,2.27,,,,,3.26,1.24,...,,,,,,,andaman & nicobar,south andaman,neil island,india
andaman & nicobar-_-south andaman-_-south andaman,,,2.56,1.18,,,,,1.43,1.03,...,,,,,2.16,0.86,andaman & nicobar,south andaman,south andaman,india
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
west bengal-_-uttar dinajpur-_-islampur,4.53,2.86,,,3.97,2.68,3.82,2.65,4.26,1.00,...,,,2.93,1.96,2.41,1.82,west bengal,uttar dinajpur,islampur,india
west bengal-_-uttar dinajpur-_-itahar,4.42,2.74,,,5.01,1.17,3.24,2.26,5.23,2.19,...,4.96,2.29,4.02,1.78,2.56,2.20,west bengal,uttar dinajpur,itahar,india
west bengal-_-uttar dinajpur-_-kaliganj,4.32,2.69,,,3.96,2.01,4.28,2.58,5.61,2.03,...,4.94,2.82,3.60,2.07,5.11,3.22,west bengal,uttar dinajpur,kaliganj,india
west bengal-_-uttar dinajpur-_-karandighi,5.77,4.80,,,3.95,2.26,3.45,2.57,4.61,1.79,...,5.58,3.27,3.79,2.19,,,west bengal,uttar dinajpur,karandighi,india


In [14]:
clmns = df.columns.tolist()
clmns = clmns[-4:] + clmns[:-4]
df = df[clmns]

In [15]:
clmns

['state',
 'district',
 'block',
 'india',
 '1994-pem',
 '1994-pom',
 '1995-pem',
 '1995-pom',
 '1996-pem',
 '1996-pom',
 '1997-pem',
 '1997-pom',
 '1998-pem',
 '1998-pom',
 '1999-pem',
 '1999-pom',
 '2000-pem',
 '2000-pom',
 '2001-pem',
 '2001-pom',
 '2002-pem',
 '2002-pom',
 '2003-pem',
 '2003-pom',
 '2004-pem',
 '2004-pom',
 '2005-pem',
 '2005-pom',
 '2006-pem',
 '2006-pom',
 '2007-pem',
 '2007-pom',
 '2008-pem',
 '2008-pom',
 '2009-pem',
 '2009-pom',
 '2010-pem',
 '2010-pom',
 '2011-pem',
 '2011-pom',
 '2012-pem',
 '2012-pom',
 '2013-pem',
 '2013-pom',
 '2014-pem',
 '2014-pom',
 '2015-pem',
 '2015-pom',
 '2016-pem',
 '2016-pom',
 '2017-pem',
 '2017-pom',
 '2018-pem',
 '2018-pom',
 '2019-pem',
 '2019-pom']

In [16]:
# clmns=clmns[:-2]+[clmns[-1]]+[clmns[-2]]

In [17]:
clmns

['state',
 'district',
 'block',
 'india',
 '1994-pem',
 '1994-pom',
 '1995-pem',
 '1995-pom',
 '1996-pem',
 '1996-pom',
 '1997-pem',
 '1997-pom',
 '1998-pem',
 '1998-pom',
 '1999-pem',
 '1999-pom',
 '2000-pem',
 '2000-pom',
 '2001-pem',
 '2001-pom',
 '2002-pem',
 '2002-pom',
 '2003-pem',
 '2003-pom',
 '2004-pem',
 '2004-pom',
 '2005-pem',
 '2005-pom',
 '2006-pem',
 '2006-pom',
 '2007-pem',
 '2007-pom',
 '2008-pem',
 '2008-pom',
 '2009-pem',
 '2009-pom',
 '2010-pem',
 '2010-pom',
 '2011-pem',
 '2011-pom',
 '2012-pem',
 '2012-pom',
 '2013-pem',
 '2013-pom',
 '2014-pem',
 '2014-pom',
 '2015-pem',
 '2015-pom',
 '2016-pem',
 '2016-pom',
 '2017-pem',
 '2017-pom',
 '2018-pem',
 '2018-pom',
 '2019-pem',
 '2019-pom']

In [18]:
df=df[clmns]

In [19]:
df = df.reset_index(drop=True)

In [20]:
# df['2020-pom'].isna().sum()

In [21]:
# station_col=[str(i)+'-st' for i in range(1994,2021)]

In [22]:
# df_sub=df[station_col]

In [23]:
# df['total-st']=df_sub.max(axis=1)

In [24]:
df

Unnamed: 0,state,district,block,india,1994-pem,1994-pom,1995-pem,1995-pom,1996-pem,1996-pom,...,2015-pem,2015-pom,2016-pem,2016-pom,2017-pem,2017-pom,2018-pem,2018-pom,2019-pem,2019-pom
0,andaman & nicobar,north & middle andaman,baratang island,india,,,3.45,0.09,,,...,,,,,,,,,1.94,0.32
1,andaman & nicobar,north & middle andaman,middle andaman,india,,,2.61,1.02,,,...,,,,,,,,,2.38,0.79
2,andaman & nicobar,south andaman,havelock island,india,,,3.08,1.64,,,...,,,,,,,,,,
3,andaman & nicobar,south andaman,neil island,india,,,4.39,2.27,,,...,,,,,,,,,,
4,andaman & nicobar,south andaman,south andaman,india,,,2.56,1.18,,,...,,,,,,,,,2.16,0.86
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5363,west bengal,uttar dinajpur,islampur,india,4.53,2.86,,,3.97,2.68,...,,,,,,,2.93,1.96,2.41,1.82
5364,west bengal,uttar dinajpur,itahar,india,4.42,2.74,,,5.01,1.17,...,,,3.70,2.76,4.96,2.29,4.02,1.78,2.56,2.20
5365,west bengal,uttar dinajpur,kaliganj,india,4.32,2.69,,,3.96,2.01,...,,,4.64,2.82,4.94,2.82,3.60,2.07,5.11,3.22
5366,west bengal,uttar dinajpur,karandighi,india,5.77,4.80,,,3.95,2.26,...,,,4.98,2.86,5.58,3.27,3.79,2.19,,


In [25]:
df.to_csv(data_out_csv + r"gw-block-pre-post.csv")
df.to_parquet(
    data_out_comp + r"gw-block-pre-post.parquet.gzip", compression="gzip"
)

### Rainfall

In [9]:
# Rainfall in mm

In [2]:
file_name = []
lat = []
lon = []
elev = []
for f in tqdm(glob.glob("../data/raw-files/rainfall-data/*.csv")):
    df = pd.read_csv(f, index_col=False)
    file_name.append(re.search("\d+", f)[0])
    lon.append(df.iloc[0, 1])
    lat.append(df.iloc[0, 2])
    elev.append(df.iloc[0, 3])
    df = df.iloc[:, [0, 4, 5, 6, 7, 8, 9]]
    df.to_csv(f, index=False)

100%|██████████| 4409/4409 [03:04<00:00, 23.94it/s]


In [3]:
df_new = pd.DataFrame(
    {
        "file-name": file_name,
        "longitude": lon,
        "latitude": lat,
        "elevation": elev,
    }
)
df_new["elevation"] = df_new["elevation"].replace(-9999, np.NaN)

In [14]:
df_new.isna().sum()

file-name      0
longitude      0
latitude       0
elevation    338
dtype: int64

In [19]:
df_new.head()

Unnamed: 0,file-name,longitude,latitude,elevation
0,214841,84.0625,21.387699,174.0
1,367753,75.3125,36.686901,4677.0
2,364772,77.1875,36.374699,4350.0
3,217738,73.75,21.6999,361.0
4,223866,86.5625,22.3244,137.0


In [None]:
df_new.to_csv(r"../data/rainfall-place-attr.csv", index=False)

### Names of districts

In [2]:
df = pd.read_csv(r"../data/rainfall-place-attr.csv")

In [3]:
df

Unnamed: 0,file-name,longitude,latitude,elevation
0,214841,84.0625,21.387699,174.0
1,367753,75.3125,36.686901,4677.0
2,364772,77.1875,36.374699,4350.0
3,217738,73.7500,21.699900,361.0
4,223866,86.5625,22.324400,137.0
...,...,...,...,...
4404,276903,90.3125,27.632299,4350.0
4405,279903,90.3125,27.944500,5346.0
4406,251750,75.0000,25.134399,455.0
4407,205766,76.5625,20.451000,479.0


In [4]:
import googlemaps

tqdm.pandas()

In [5]:
maps = googlemaps.Client(key="AIzaSyAOxtei3qW-NTc51_unIjVJnjXfHJZ9RlE")

In [17]:
print(maps.reverse_geocode((36.686901, 75.3125)))

[]


In [6]:
df["location"] = df.progress_apply(
    lambda row: maps.reverse_geocode((row["latitude"], row["longitude"])),
    axis=1,
)

HBox(children=(FloatProgress(value=0.0, max=4409.0), HTML(value='')))




In [8]:
with open(r"../data/rainfall-place-attr-district-raw.pickle", "wb") as f:
    pickle.dump(df, f)

In [9]:
with open(r"../data/rainfall-place-attr-district-raw.pickle", "rb") as f:
    df = pickle.load(f)

In [19]:
# location is empty for remote locations

In [13]:
df["location"] = df.parallel_apply(
    lambda row: np.NaN if len(row["location"]) == 0 else row["location"], axis=1
)

In [22]:
df = df.dropna(subset=["location"])

In [25]:
df.isna().sum()

file-name      0
longitude      0
latitude       0
elevation    235
location       0
dtype: int64

In [46]:
def get_value(row, type_v):
    val = ""
    if len(row["location"]) > 0:
        for i in row["location"]:
            if type_v in i["types"]:
                return i["address_components"][0]["long_name"]
    return np.NaN

In [47]:
df["country"] = df.apply(get_value, args=("country",), axis=1)
df["state"] = df.apply(get_value, args=("administrative_area_level_1",), axis=1)
df["district"] = df.apply(
    get_value, args=("administrative_area_level_2",), axis=1
)

HBox(children=(FloatProgress(value=0.0, max=4190.0), HTML(value='')))




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [56]:
df.isna().sum()

file-name      0
longitude      0
latitude       0
elevation    235
location       0
country      172
state        405
district     939
dtype: int64

In [80]:
# order followed remove nulls:
# country null or not india
# country null=ocean, mountain
# elevation null and district null and state null= ocean or mountain
# elevation null assigned value
# state,city,country=null=> mountains or ocean
# district null= forest/ocean/water body/mountain
# state null= forest/ocean/water body/mountain

In [63]:
df = df[(df["country"] == "India") | (df["country"].isna())]

In [64]:
df.isna().sum()

file-name      0
longitude      0
latitude       0
elevation    190
location       0
country      172
state        363
district     373
dtype: int64

In [65]:
df.drop(
    df[
        df["elevation"].isna() & df["district"].isna() & df["state"].isna()
    ].index,
    inplace=True,
)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


In [66]:
df.isna().sum()

file-name      0
longitude      0
latitude       0
elevation      3
location       0
country      172
state        176
district     186
dtype: int64

In [77]:
df[df["elevation"].isna()]

Unnamed: 0,file-name,longitude,latitude,elevation,location,country,state,district
2761,92794,79.375,9.21075,,[{'address_components': [{'long_name': 'Ramesw...,India,Tamil Nadu,


In [70]:
df.reset_index(drop=True, inplace=True)

In [79]:
df.iloc[1839, 6] = "Gujarat"
df.iloc[1839, 3] = 3
df.iloc[1944, 3] = 4
df.dropna(subset=["elevation"], inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item] = s
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.


In [82]:
df[df["state"].isna() & df["district"].notna()]["state"] = "West Bengal"

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [86]:
df[(df["country"].isna()) & (df["state"].isna()) & (df["district"].isna())]

Unnamed: 0,file-name,longitude,latitude,elevation,location,country,state,district
5,329756,75.6250,32.940102,3471.0,[{'address_components': [{'long_name': '182222...,,,
17,336753,75.3125,33.564602,2317.0,[{'address_components': [{'long_name': 'Nation...,,,
26,361741,74.0625,36.062401,2102.0,[{'address_components': [{'long_name': 'Gilgit...,,,
29,345766,76.5625,34.501301,3990.0,[{'address_components': [{'long_name': '194106...,,,
53,342763,76.2500,34.189098,4201.0,[{'address_components': [{'long_name': '194109...,,,
...,...,...,...,...,...,...,...,...
2691,339778,77.8125,33.876801,3945.0,[{'address_components': [{'long_name': '194201...,,,
2710,348775,77.5000,34.813499,3268.0,[{'address_components': [{'long_name': 'Unname...,,,
2721,279928,92.8125,27.944500,4244.0,[{'address_components': [{'long_name': '790102...,,,
2736,279931,93.1250,27.944500,2362.0,[{'address_components': [{'long_name': '791118...,,,


In [88]:
df.dropna(subset=["country"], inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [91]:
df.dropna(subset=["state"], inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [94]:
df.dropna(subset=["district"], inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [96]:
with open(
    r"../data/rainfall-place-attr-district-raw-null-removed.pickle", "wb"
) as f:
    pickle.dump(df, f)

In [123]:
df.to_csv(r"../data/rainfall-place-attr-district.csv", index=False)

In [119]:
df = pd.read_csv(r"../data/rainfall-place-attr-district.csv")

In [122]:
df.drop(columns=["location"], inplace=True)

In [118]:
# save considered files into new folder rainfall-data-final-till-2014

In [116]:
def extract_useful(row):
    temp = pd.read_csv(
        r"../data/raw-files/rainfall-data/weatherdata-"
        + str(row["file-name"])
        + ".csv"
    )
    temp.to_csv(
        "../data/raw-files/rainfall-data-final-till-2014/weatherdata-"
        + str(row["file-name"])
        + ".csv",
        index=False,
    )

In [117]:
t = df.progress_apply(extract_useful, axis=1)

HBox(children=(FloatProgress(value=0.0, max=2615.0), HTML(value='')))


