In [None]:
# Fetch the project repository; later cells read its raw HMIS CSVs from /content/Kosaksi-Pasapugazh-and-experiments.
!git clone https://github.com/dcs-sastra/Kosaksi-Pasapugazh-and-experiments.git

Cloning into 'Kosaksi-Pasapugazh-and-experiments'...
remote: Enumerating objects: 590, done.[K
remote: Counting objects: 100% (95/95), done.[K
remote: Compressing objects: 100% (78/78), done.[K
remote: Total 590 (delta 36), reused 39 (delta 12), pack-reused 495[K
Receiving objects: 100% (590/590), 123.88 MiB | 12.15 MiB/s, done.
Resolving deltas: 100% (414/414), done.
Updating files: 100% (478/478), done.


**Data Processing Summary:**

- Read the 'All_India.csv' file from each fiscal-year folder, 2008-2009 through 2016-2017.
- Filtered rows to keep only 'TOTAL' entries.
- Renamed and standardized column names.
- Added a 'year' column for each dataset.
- Removed rows for "M O Defence" and "M O Railways".
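- In the district-level pass (the "With Districts" section below), labelled each row with its 'district' and with the 'state' taken from the source file name.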
- Combined all yearly data into a single DataFrame.
- Saved the combined data as 'combined_all_india.csv'.

In [None]:
import os
import pandas as pd
import glob
import IPython.display as ipd

# Define the base directory where your year folders are located
base_dir = "/content/Kosaksi-Pasapugazh-and-experiments/data/raw/Health - HMIS"

# Values of the 'state' column whose rows are left out of the combined output
EXCLUDED_STATES = ["M O Defence", "M O Railways"]


def clean_all_india_frame(raw_df, year_label):
    """Reshape one raw 'All_India.csv' table into the combined state-level layout.

    Parameters
    ----------
    raw_df : pandas.DataFrame
        Frame as returned by ``pd.read_csv`` for one year's file. It must
        contain the columns 'Unnamed: 0', 'Unnamed: 1', 'Unnamed: 2' and
        'Unnamed: 3'.
    year_label : str
        Value stored in the 'year' column, e.g. "2008-2009".

    Returns
    -------
    pandas.DataFrame
        A new frame holding only the rows whose 'Unnamed: 3' value is 'TOTAL'.
        The first three columns are renamed to 'state', 'medical_code' and
        'medical_description', 'Unnamed: 3' is dropped, every column name is
        lower-cased, rows whose state is in EXCLUDED_STATES are removed, and a
        trailing 'year' column is added. ``raw_df`` itself is not modified.

    Raises
    ------
    KeyError
        If 'Unnamed: 3' is missing from ``raw_df``.
    """
    cleaned = (
        raw_df[raw_df['Unnamed: 3'] == 'TOTAL']
        .rename(columns={
            'Unnamed: 0': 'state',
            'Unnamed: 1': 'medical_code',
            'Unnamed: 2': 'medical_description',
        })
        .drop(columns=['Unnamed: 3'])
    )

    # Month/total headers arrive capitalised in the raw file; normalise them
    cleaned.columns = cleaned.columns.str.lower()

    # Remove rows for "M O Defence" and "M O Railways"
    cleaned = cleaned[~cleaned['state'].isin(EXCLUDED_STATES)]

    # assign() returns a copy, so the filtered slice is never written to in place
    return cleaned.assign(year=year_label)


# Initialize an empty list to store individual dataframes
all_data = []

# Loop through the fiscal years 2008-2009 to 2016-2017
for year in range(2008, 2017):
    year_label = f"{year}-{year+1}"

    # Each year's files live in a doubly nested "<year>-<year+1>" folder
    file_path = os.path.join(base_dir, year_label, year_label, "All_India.csv")

    if not os.path.exists(file_path):
        print(f"File not found for year {year_label}")
        continue

    df = pd.read_csv(file_path)

    # Same guard as the district-level cell: skip files without the marker column
    if 'Unnamed: 3' not in df.columns:
        print(f"Skipping file {file_path} as 'Unnamed: 3' is not present.")
        continue

    all_data.append(clean_all_india_frame(df, year_label))

# pd.concat raises ValueError on an empty list, so only combine when at least
# one year was loaded.
if all_data:
    combined_data = pd.concat(all_data, ignore_index=True)

    # Save the combined data to a new CSV file
    combined_data.to_csv("combined_all_india.csv", index=False)

    print("Data consolidation complete. Output saved as 'combined_all_india.csv'")
else:
    print("No data to concatenate. Please check your file paths and data.")

Data consolidation complete. Output saved as 'combined_all_india.csv'


In [None]:
# Reload the consolidated CSV to inspect what was written. The path is relative
# so it resolves against the same working directory that
# to_csv("combined_all_india.csv") wrote to, instead of assuming /content.
newdata = pd.read_csv("combined_all_india.csv")
ipd.display(newdata)

Unnamed: 0,state,medical_code,medical_description,april,may,june,july,august,september,october,november,december,january,february,march,total,year
0,A & N Islands,1.1,Total number of pregnant women Registered for ANC,1621.0,1047.0,559.0,690.0,547.0,,1397.0,1338.0,,,,,7199.0,2008-2009
1,A & N Islands,1.1.1,Of which Number registered within first trimes...,,954.0,487.0,558.0,537.0,,1303.0,1280.0,,,,,5119.0,2008-2009
2,A & N Islands,1.2,Number of Pregnant women registered under JSY,,,,,,,,,,,,,,2008-2009
3,A & N Islands,1.3,Number of pregnant women received 3 ANC check ...,,,,,,,,,,,,,,2008-2009
4,A & N Islands,1.4.1,Number of pregnant women given TT1 during curr...,,,,,,,,,,,,,,2008-2009
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
62947,West Bengal,15.1.1.b,"Out of the total number of Hb tests conducted,...",4353.0,4846.0,5089.0,4351.0,6501.0,6323.0,6836.0,6867.0,6350.0,7062.0,6383.0,9346.0,74307.0,2016-2017
62948,West Bengal,15.4.1,Number of blood smears examined for Malaria,440200.0,496443.0,539475.0,636864.0,746746.0,660014.0,522969.0,625486.0,521306.0,437223.0,453042.0,530880.0,6610648.0,2016-2017
62949,West Bengal,15.4.2,"Out of blood smears examined for malaria, numb...",485.0,470.0,490.0,1063.0,1423.0,1284.0,1237.0,3429.0,1469.0,364.0,887.0,637.0,13238.0,2016-2017
62950,West Bengal,15.4.3,"Out of blood smears examined for malaria, numb...",308.0,136.0,264.0,477.0,792.0,442.0,296.0,594.0,389.0,287.0,69.0,417.0,4471.0,2016-2017


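**With Districts:** the same consolidation applied to every per-state CSV in each year folder, so each row carries its district and the state taken from the file name.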

In [None]:
import os
import pandas as pd
import glob
import IPython.display as ipd

# Define the base directory where your year folders are located
base_dir = "/content/Kosaksi-Pasapugazh-and-experiments/data/raw/Health - HMIS"

# File names (without ".csv") that are not loaded.
# NOTE(review): "All_India.csv" also matches the glob and is NOT skipped, so its
# rows are stored with state == "All_India" (visible in the saved output of the
# next cell). Confirm whether those aggregate rows belong in this table.
SKIPPED_FILES = ["M O Defence", "M O Railways"]

# Column names applied by position once 'Unnamed: 3' has been dropped
DISTRICT_COLUMNS = [
    'district', 'medical_code', 'medical_description',
    'april', 'may', 'june', 'july', 'august', 'september',
    'october', 'november', 'december', 'january', 'february', 'march',
    'total',
]


def clean_state_frame(raw_df, state_name, year_folder):
    """Reshape one raw per-file table into the combined district-level layout.

    Parameters
    ----------
    raw_df : pandas.DataFrame
        Frame as returned by ``pd.read_csv`` for one CSV in a year folder.
    state_name : str
        Value stored in the 'state' column (the file name without extension).
    year_folder : str
        Value stored in the 'year' column, e.g. "2008-2009".

    Returns
    -------
    pandas.DataFrame or None
        ``None`` when ``raw_df`` has no 'Unnamed: 3' column. Otherwise a new
        frame with only the rows whose 'Unnamed: 3' value is 'TOTAL', that
        column dropped, the remaining columns renamed by position to
        DISTRICT_COLUMNS, and 'year' / 'state' inserted so the column order is
        year, district, state, medical_code, ...

    Raises
    ------
    ValueError
        If the number of remaining columns differs from len(DISTRICT_COLUMNS).
    """
    if 'Unnamed: 3' not in raw_df.columns:
        return None

    cleaned = raw_df[raw_df['Unnamed: 3'] == 'TOTAL'].drop(columns=['Unnamed: 3'])

    # Positional rename: raises ValueError when the file has a different width
    cleaned.columns = DISTRICT_COLUMNS

    # Insert 'state' right after 'district' first, then 'year' at the front
    cleaned.insert(1, 'state', state_name)
    cleaned.insert(0, 'year', year_folder)
    return cleaned


# Initialize an empty list to store individual dataframes
all_data = []

# Loop through the fiscal years 2008-2009 to 2016-2017
for year in range(2008, 2017):
    year_folder = f"{year}-{year + 1}"
    sub_dir = os.path.join(base_dir, year_folder, year_folder)

    print(f"Processing year: {year_folder}")

    if not os.path.isdir(sub_dir):
        # Report the gap instead of silently producing a table with a year missing
        print(f"Directory not found for year {year_folder}: {sub_dir}")
        continue

    # glob returns files in filesystem order; sort so the row order of the
    # combined output is the same on every run and machine.
    csv_files = sorted(glob.glob(os.path.join(sub_dir, "*.csv")))

    print(f"Found {len(csv_files)} CSV files in {sub_dir}")

    for file_path in csv_files:
        file_name = os.path.basename(file_path)
        # splitext removes only the trailing extension, unlike str.replace
        state_name = os.path.splitext(file_name)[0]

        if state_name in SKIPPED_FILES:
            continue

        try:
            df = clean_state_frame(pd.read_csv(file_path), state_name, year_folder)
        except Exception as e:
            # Best effort: one unreadable or oddly shaped file must not abort the run
            print(f"Error processing file {file_name}: {e}")
            continue

        if df is None:
            print(f"Skipping file {file_path} as 'Unnamed: 3' is not present.")
            continue

        # Append the dataframe to the list
        all_data.append(df)

# Concatenate all dataframes in the list
if all_data:
    combined_data = pd.concat(all_data, ignore_index=True)
    # Save the combined data to a new CSV file.
    # NOTE(review): this is the same file name the state-level cell writes, so
    # that earlier output is replaced here — confirm this is intended.
    combined_data.to_csv("combined_all_india.csv", index=False)
    print("Data consolidation complete. Output saved as 'combined_all_india.csv'")
else:
    print("No data to concatenate. Please check your file paths and data.")


Processing year: 2008-2009
Found 38 CSV files in /content/Kosaksi-Pasapugazh-and-experiments/data/raw/Health - HMIS/2008-2009/2008-2009
Processing year: 2009-2010
Found 38 CSV files in /content/Kosaksi-Pasapugazh-and-experiments/data/raw/Health - HMIS/2009-2010/2009-2010
Processing year: 2010-2011
Found 38 CSV files in /content/Kosaksi-Pasapugazh-and-experiments/data/raw/Health - HMIS/2010-2011/2010-2011
Processing year: 2011-2012
Found 38 CSV files in /content/Kosaksi-Pasapugazh-and-experiments/data/raw/Health - HMIS/2011-2012/2011-2012
Processing year: 2012-2013
Found 38 CSV files in /content/Kosaksi-Pasapugazh-and-experiments/data/raw/Health - HMIS/2012-2013/2012-2013
Processing year: 2013-2014
Found 38 CSV files in /content/Kosaksi-Pasapugazh-and-experiments/data/raw/Health - HMIS/2013-2014/2013-2014
Processing year: 2014-2015
Found 39 CSV files in /content/Kosaksi-Pasapugazh-and-experiments/data/raw/Health - HMIS/2014-2015/2014-2015
Processing year: 2015-2016
Found 39 CSV files in

In [None]:
# Reload the CSV written by the cell above to inspect the result. The path is
# relative so it resolves against the same working directory that
# to_csv("combined_all_india.csv") wrote to, instead of assuming /content.
newdata = pd.read_csv("combined_all_india.csv")
ipd.display(newdata)

Unnamed: 0,year,district,state,medical_code,medical_description,april,may,june,july,august,september,october,november,december,january,february,march,total
0,2008-2009,Araria,Bihar,1.1,Total number of pregnant women Registered for ANC,1521.0,,1901.0,2016.0,2453.0,3642.0,4213.0,3388.0,,4016.0,2836.0,2379.0,28365.0
1,2008-2009,Araria,Bihar,1.1.1,Of which Number registered within first trimes...,1521.0,1886.0,1515.0,1827.0,2254.0,3486.0,3919.0,3388.0,,3830.0,2771.0,2220.0,28617.0
2,2008-2009,Araria,Bihar,1.2,Number of Pregnant women registered under JSY,212.0,1576.0,1515.0,1827.0,2254.0,3486.0,3919.0,3025.0,,3830.0,2771.0,2220.0,26635.0
3,2008-2009,Araria,Bihar,1.3,Number of pregnant women received 3 ANC check ...,1733.0,1576.0,1515.0,1827.0,2254.0,3486.0,3919.0,3035.0,,3830.0,,2220.0,25395.0
4,2008-2009,Araria,Bihar,1.4.1,Number of pregnant women given TT1 during curr...,1733.0,1886.0,1901.0,2016.0,2453.0,3642.0,3072.0,3012.0,,4016.0,2836.0,2378.0,28945.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1209481,2016-2017,West Bengal,All_India,15.1.1.b,"Out of the total number of Hb tests conducted,...",4353.0,4846.0,5089.0,4351.0,6501.0,6323.0,6836.0,6867.0,6350.0,7062.0,6383.0,9346.0,74307.0
1209482,2016-2017,West Bengal,All_India,15.4.1,Number of blood smears examined for Malaria,440200.0,496443.0,539475.0,636864.0,746746.0,660014.0,522969.0,625486.0,521306.0,437223.0,453042.0,530880.0,6610648.0
1209483,2016-2017,West Bengal,All_India,15.4.2,"Out of blood smears examined for malaria, numb...",485.0,470.0,490.0,1063.0,1423.0,1284.0,1237.0,3429.0,1469.0,364.0,887.0,637.0,13238.0
1209484,2016-2017,West Bengal,All_India,15.4.3,"Out of blood smears examined for malaria, numb...",308.0,136.0,264.0,477.0,792.0,442.0,296.0,594.0,389.0,287.0,69.0,417.0,4471.0
