In [12]:
import pandas as pd
import glob
import numpy as np
import re
from tqdm.auto import tqdm
import pickle
from pandarallel import pandarallel

pandarallel.initialize()

INFO: Pandarallel will run on 4 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


### GroundWater

In [2]:
# ground-water: unit =mm

In [3]:
gd_dfs = []
for f in glob.glob("../data/groundwater/*"):
    df = pd.read_csv(f, header=1)
    df["loc"] = df.apply(
        lambda x: x["Unnamed: 0"] + "-" + x["Unnamed: 1"], axis=1
    )
    df = df.iloc[1:, 2:]
    df = df.set_index("loc").T
    df.index = pd.to_datetime(df.index)
    gd_dfs.append(df)
df = pd.concat(gd_dfs)
df = df.sort_index()
df = df.replace("-", np.NaN)
df = df.apply(pd.to_numeric)
df = df * 1000
df["date"] = df.index
cols = df.columns.tolist()
cols = cols[-1:] + cols[:-1]
df = df[cols]

In [5]:
df.to_csv(r"../data/groundwater.csv", index=False)

### Rainfall

In [9]:
# Rainfall in mm

In [2]:
file_name = []
lat = []
lon = []
elev = []
for f in tqdm(glob.glob("../data/raw-files/rainfall-data/*.csv")):
    df = pd.read_csv(f, index_col=False)
    file_name.append(re.search("\d+", f)[0])
    lon.append(df.iloc[0, 1])
    lat.append(df.iloc[0, 2])
    elev.append(df.iloc[0, 3])
    df = df.iloc[:, [0, 4, 5, 6, 7, 8, 9]]
    df.to_csv(f, index=False)

100%|██████████| 4409/4409 [03:04<00:00, 23.94it/s]


In [3]:
df_new = pd.DataFrame(
    {
        "file-name": file_name,
        "longitude": lon,
        "latitude": lat,
        "elevation": elev,
    }
)
df_new["elevation"] = df_new["elevation"].replace(-9999, np.NaN)

In [14]:
df_new.isna().sum()

file-name      0
longitude      0
latitude       0
elevation    338
dtype: int64

In [19]:
df_new.head()

Unnamed: 0,file-name,longitude,latitude,elevation
0,214841,84.0625,21.387699,174.0
1,367753,75.3125,36.686901,4677.0
2,364772,77.1875,36.374699,4350.0
3,217738,73.75,21.6999,361.0
4,223866,86.5625,22.3244,137.0


In [None]:
df_new.to_csv(r"../data/rainfall-place-attr.csv", index=False)

### Names of districts

In [2]:
df = pd.read_csv(r"../data/rainfall-place-attr.csv")

In [3]:
df

Unnamed: 0,file-name,longitude,latitude,elevation
0,214841,84.0625,21.387699,174.0
1,367753,75.3125,36.686901,4677.0
2,364772,77.1875,36.374699,4350.0
3,217738,73.7500,21.699900,361.0
4,223866,86.5625,22.324400,137.0
...,...,...,...,...
4404,276903,90.3125,27.632299,4350.0
4405,279903,90.3125,27.944500,5346.0
4406,251750,75.0000,25.134399,455.0
4407,205766,76.5625,20.451000,479.0


In [4]:
import googlemaps

tqdm.pandas()

In [5]:
maps = googlemaps.Client(key="AIzaSyAOxtei3qW-NTc51_unIjVJnjXfHJZ9RlE")

In [17]:
print(maps.reverse_geocode((36.686901, 75.3125)))

[]


In [6]:
df["location"] = df.progress_apply(
    lambda row: maps.reverse_geocode((row["latitude"], row["longitude"])),
    axis=1,
)

HBox(children=(FloatProgress(value=0.0, max=4409.0), HTML(value='')))




In [8]:
with open(r"../data/rainfall-place-attr-district-raw.pickle", "wb") as f:
    pickle.dump(df, f)

In [9]:
with open(r"../data/rainfall-place-attr-district-raw.pickle", "rb") as f:
    df = pickle.load(f)

In [19]:
# location is empty for remote locations

In [13]:
df["location"] = df.parallel_apply(
    lambda row: np.NaN if len(row["location"]) == 0 else row["location"], axis=1
)

In [22]:
df = df.dropna(subset=["location"])

In [25]:
df.isna().sum()

file-name      0
longitude      0
latitude       0
elevation    235
location       0
dtype: int64

In [46]:
def get_value(row, type_v):
    val = ""
    if len(row["location"]) > 0:
        for i in row["location"]:
            if type_v in i["types"]:
                return i["address_components"][0]["long_name"]
    return np.NaN

In [47]:
df["country"] = df.apply(get_value, args=("country",), axis=1)
df["state"] = df.apply(get_value, args=("administrative_area_level_1",), axis=1)
df["district"] = df.apply(
    get_value, args=("administrative_area_level_2",), axis=1
)

HBox(children=(FloatProgress(value=0.0, max=4190.0), HTML(value='')))




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [56]:
df.isna().sum()

file-name      0
longitude      0
latitude       0
elevation    235
location       0
country      172
state        405
district     939
dtype: int64

In [80]:
# order followed remove nulls:
# country null or not india
# country null=ocean, mountain
# elevation null and district null and state null= ocean or mountain
# elevation null assigned value
# state,city,country=null=> mountains or ocean
# district null= forest/ocean/water body/mountain
# state null= forest/ocean/water body/mountain

In [63]:
df = df[(df["country"] == "India") | (df["country"].isna())]

In [64]:
df.isna().sum()

file-name      0
longitude      0
latitude       0
elevation    190
location       0
country      172
state        363
district     373
dtype: int64

In [65]:
df.drop(
    df[
        df["elevation"].isna() & df["district"].isna() & df["state"].isna()
    ].index,
    inplace=True,
)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


In [66]:
df.isna().sum()

file-name      0
longitude      0
latitude       0
elevation      3
location       0
country      172
state        176
district     186
dtype: int64

In [77]:
df[df["elevation"].isna()]

Unnamed: 0,file-name,longitude,latitude,elevation,location,country,state,district
2761,92794,79.375,9.21075,,[{'address_components': [{'long_name': 'Ramesw...,India,Tamil Nadu,


In [70]:
df.reset_index(drop=True, inplace=True)

In [79]:
df.iloc[1839, 6] = "Gujarat"
df.iloc[1839, 3] = 3
df.iloc[1944, 3] = 4
df.dropna(subset=["elevation"], inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item] = s
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.


In [82]:
df[df["state"].isna() & df["district"].notna()]["state"] = "West Bengal"

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [86]:
df[(df["country"].isna()) & (df["state"].isna()) & (df["district"].isna())]

Unnamed: 0,file-name,longitude,latitude,elevation,location,country,state,district
5,329756,75.6250,32.940102,3471.0,[{'address_components': [{'long_name': '182222...,,,
17,336753,75.3125,33.564602,2317.0,[{'address_components': [{'long_name': 'Nation...,,,
26,361741,74.0625,36.062401,2102.0,[{'address_components': [{'long_name': 'Gilgit...,,,
29,345766,76.5625,34.501301,3990.0,[{'address_components': [{'long_name': '194106...,,,
53,342763,76.2500,34.189098,4201.0,[{'address_components': [{'long_name': '194109...,,,
...,...,...,...,...,...,...,...,...
2691,339778,77.8125,33.876801,3945.0,[{'address_components': [{'long_name': '194201...,,,
2710,348775,77.5000,34.813499,3268.0,[{'address_components': [{'long_name': 'Unname...,,,
2721,279928,92.8125,27.944500,4244.0,[{'address_components': [{'long_name': '790102...,,,
2736,279931,93.1250,27.944500,2362.0,[{'address_components': [{'long_name': '791118...,,,


In [88]:
df.dropna(subset=["country"], inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [91]:
df.dropna(subset=["state"], inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [94]:
df.dropna(subset=["district"], inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [96]:
with open(
    r"../data/rainfall-place-attr-district-raw-null-removed.pickle", "wb"
) as f:
    pickle.dump(df, f)

In [123]:
df.to_csv(r"../data/rainfall-place-attr-district.csv", index=False)

In [119]:
df = pd.read_csv(r"../data/rainfall-place-attr-district.csv")

In [122]:
df.drop(columns=['location'],inplace=True)

In [118]:
# save considered files into new folder rainfall-data-final-till-2014

In [116]:
def extract_useful(row):
    temp = pd.read_csv(
        r"../data/raw-files/rainfall-data/weatherdata-"
        + str(row["file-name"])
        + ".csv"
    )
    temp.to_csv(
        "../data/raw-files/rainfall-data-final-till-2014/weatherdata-"
        + str(row["file-name"])
        + ".csv",
        index=False,
    )

In [117]:
t = df.progress_apply(extract_useful, axis=1)

HBox(children=(FloatProgress(value=0.0, max=2615.0), HTML(value='')))


