In [None]:
# Program to read the data from the CSV files and calculate the mean, mode and median of each numeric column for every day
'''data columns are: Index(['Day', 'Month', 'Year', 'States/UTs', 'Rice', 'Wheat', 'Atta (Wheat)',
       'Gram Dal', 'Tur/Arhar Dal', 'Urad Dal', 'Moong Dal', 'Masoor Dal',
       'Sugar', 'Milk @', 'Groundnut Oil (Packed)', 'Mustard Oil (Packed)',
       'Vanaspati (Packed)', 'Soya Oil (Packed)', 'Sunflower Oil (Packed)',
       'Palm Oil (Packed)', 'Gur', 'Tea Loose', 'Salt Pack (Iodised)',
       'Potato', 'Onion', 'Tomato'],
      dtype='object')'''


In [3]:
# Importing the required libraries
import statistics
import numpy as np
import pandas as pd

In [14]:
# Function to calculate the mean, mode and median of the data
def _column_stat(values, stat):
    """Return one statistic ('mean', 'mode' or 'median') of a NaN-free numeric Series.

    An empty Series yields NaN for every statistic, so a column without any
    observation on a given day produces a blank cell instead of making
    ``statistics.mode`` raise ``StatisticsError``.
    """
    if values.empty:
        return np.nan
    if stat == 'mean':
        return values.mean()
    if stat == 'mode':
        # statistics.mode reports a single value only; on Python 3.8+ ties
        # resolve to the first value encountered.
        return statistics.mode(values)
    return np.median(values)


def calculate_daily_stat(data, days):
    """Compute per-day mean, mode and median for every column of ``data``.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the columns 'Day', 'Month' and 'Year'; every other
        column is summarised as well.
    days : pandas.DataFrame
        One row per date to summarise, with columns 'Day', 'Month' and
        'Year'. Rows are read positionally, so any index is accepted.

    Returns
    -------
    list of dict
        Three dicts per row of ``days``, in the order mean, mode, median.
        Each dict maps column name -> statistic. Columns whose dtype is not
        int64/float64 hold the placeholder string 'mean', 'mode' or 'median'.
        A numeric column with no non-NaN values on that day holds NaN.
    """
    # The dtype of a column is not changed by row filtering or dropna, so the
    # numeric/non-numeric decision is made once instead of once per day.
    is_numeric = {
        col: data[col].dtype == 'int64' or data[col].dtype == 'float64'
        for col in data.columns
    }

    results_list = []
    date_rows = days[['Day', 'Month', 'Year']].itertuples(index=False, name=None)
    for day, month, year in date_rows:
        temp = data[(data['Day'] == day) &
                    (data['Month'] == month) &
                    (data['Year'] == year)]

        # One result record per statistic, appended in a fixed order
        for stat in ('mean', 'mode', 'median'):
            daily_result = {}
            for col in temp.columns:
                if is_numeric[col]:
                    daily_result[col] = _column_stat(temp[col].dropna(), stat)
                else:
                    daily_result[col] = stat  # Placeholder for non-numerical columns
            results_list.append(daily_result)

    return results_list

In [4]:
# Function to count the number of data types in each row
def count_datatypes_per_row(df):
    """Count, for each row of ``df``, how many values there are of each type.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose cell value types are tallied.

    Returns
    -------
    pandas.DataFrame
        Same index as ``df``; one column per type name encountered
        (``type(value).__name__``), holding the number of cells of that type
        in the row. A type that does not occur in a row is NaN there.
    """
    counts = []
    # itertuples reads every column with its own dtype. iterrows would first
    # coerce each row to one common dtype, so in an all-numeric frame the
    # integer cells would be reported as floats.
    for row in df.itertuples(index=False, name=None):
        type_counts = {}
        for value in row:
            value_type = type(value).__name__  # Get the name of the data type
            type_counts[value_type] = type_counts.get(value_type, 0) + 1
        counts.append(type_counts)

    return pd.DataFrame(counts, index=df.index)

In [None]:
# Input CSV locations and the output locations for the computed statistics
path_r, path_w = 'data/cleansing/data_r.csv', 'data/cleansing/data_w.csv'
op_path_r, op_path_w = 'data/cleansing/stat_r.csv', 'data/cleansing/stat_w.csv'

# Load both input files into DataFrames
data_r = pd.read_csv(path_r)
data_w = pd.read_csv(path_w)

# Distinct (Day, Month, Year) combinations present in each dataset, renumbered from 0
days_r = data_r.loc[:, ['Day', 'Month', 'Year']].drop_duplicates(ignore_index=True)
days_w = data_w.loc[:, ['Day', 'Month', 'Year']].drop_duplicates(ignore_index=True)

# Report the dimensions of the four DataFrames, one per line
print('\n'.join(
    f'{label} shape: {shape}'
    for label, shape in (
        ('data_r', data_r.shape),
        ('data_w', data_w.shape),
        ('days_r', days_r.shape),
        ('days_w', days_w.shape),
    )
))


In [4]:
# Per-day mean, mode and median records for each dataset
results_list_r = calculate_daily_stat(data=data_r, days=days_r)
results_list_w = calculate_daily_stat(data=data_w, days=days_w)

In [None]:
# Turn each list of per-day records into a DataFrame (one row per record)
result_r = pd.DataFrame(data=results_list_r)
result_w = pd.DataFrame(data=results_list_w)

# Write the statistics to their CSV files, leaving out the row index
result_r.to_csv(path_or_buf=op_path_r, index=False)
result_w.to_csv(path_or_buf=op_path_w, index=False)

In [7]:
# Tally the value types found in every row of each dataset
result_r_counts = count_datatypes_per_row(df=data_r)
result_w_counts = count_datatypes_per_row(df=data_w)

In [None]:
# Show the per-row type counts for both datasets
print(f'result_r_counts: {result_r_counts}')
print(f'result_w_counts: {result_w_counts}')

In [11]:
# Optionally persist the per-row type counts as CSV files (row index omitted)
result_r_counts.to_csv(path_or_buf='data/cleansing/result_r_counts.csv', index=False)
result_w_counts.to_csv(path_or_buf='data/cleansing/result_w_counts.csv', index=False)