TEST 1

In [1]:
import pandas as pd
import numpy as np


In [2]:
data = pd.read_csv('Student Mental Wellness Survey.csv')
data.columns = [f'X{i}' for i in range(1, len(data.columns) + 1)]
data = data.iloc[:, 1:]
print(data.head())

   X2           X3      X4                   X5                X6         X7  \
0  20  Bhubaneswar    Male  Bachelor's 3rd Year  Computer Science   Everyday   
1  69       Liquid    Male  Bachelor's 3rd Year  Computer Science   Everyday   
2  20   Faridabad     Male  Bachelor's 3rd Year               BBA  Sometimes   
3  21          UP     Male  Bachelor's 4th Year               CSE         No   
4  56        Delhi  Female  Bachelor's 2nd Year                BA     Rarely   

         X8                  X9       X10       X11        X12       X13  \
0     Often         Weight gain     Often  Everyday   Everyday  Everyday   
1  Everyday           No change  Everyday  Everyday  Sometimes  Everyday   
2    Rarely           No change        No        No         No        No   
3        No           No change        No        No         No        No   
4    Rarely  Increased appetite    Rarely    Rarely     Rarely    Rarely   

        X14             X15      X16  X17  X18  
0    Rarely  

In [3]:
data['X3'] = data['X3'].str.strip().str.lower().replace({
    'uttar pradesh': 'Uttar Pradesh',
    'up': 'Uttar Pradesh',
    'madhya pradesh': 'Madhya Pradesh',
    'mp': 'Madhya Pradesh',
    'maharashtra': 'Maharashtra',
    'maharastra': 'Maharashtra',
    'mahrashtra': 'Maharashtra',
    'haryana': 'Haryana',
    'odisha': 'Odisha',
    'west bengal': 'West Bengal',
    'chhattisgarh': 'Chhattisgarh',
    'karnataka': 'Karnataka',
    'delhi': 'Delhi',
    'bhubaneswar': 'Odisha',
    'faridabad': 'Haryana',
    'delhi': 'Delhi',
    'rajasthan': 'Rajasthan',
    'rajisthan': 'Rajasthan',
    'tamil nadu': 'Tamil Nadu',  
    'Karnataka': 'Karnataka', 
    'delhi ncr': 'Delhi',
})

# Get the unique values in the cleaned Location column
unique_locations = data['X3'].unique()

print(unique_locations)


['Odisha' 'liquid' 'Haryana' 'Uttar Pradesh' 'Delhi' 'Rajasthan' 'bihar'
 'Maharashtra' 'West Bengal' '-' 'Madhya Pradesh' 'jharkhand' 'Tamil Nadu'
 'Chhattisgarh' 'Karnataka' 'not from india' 'gujrat' 'uttarpradesh'
 'assam' 'goa']


These columns only take 2-5 values hence one hot encoding is done

In [4]:
columns_to_encode = ['X4','X7', 'X8', 'X9', 'X10', 'X11', 'X12', 'X13', 'X14', 'X15']
data= pd.get_dummies(data, columns=columns_to_encode)
print(data.head())

   X2             X3                   X5                X6      X16  X17  \
0  20         Odisha  Bachelor's 3rd Year  Computer Science  Nothing  Yes   
1  69         liquid  Bachelor's 3rd Year  Computer Science     nope  Yes   
2  20        Haryana  Bachelor's 3rd Year               BBA       No   No   
3  21  Uttar Pradesh  Bachelor's 4th Year               CSE      NaN  Yes   
4  56          Delhi  Bachelor's 2nd Year                BA      NaN   No   

   X18  X4_Female  X4_Male  X4_Others  ...  X13_Rarely  X13_Sometimes  \
0   No      False     True      False  ...       False          False   
1  Yes      False     True      False  ...       False          False   
2  Yes      False     True      False  ...       False          False   
3  Yes      False     True      False  ...       False          False   
4   No       True    False      False  ...        True          False   

   X14_Everyday  X14_No  X14_Often  X14_Rarely  X14_Sometimes  X15_No change  \
0         False   

These columns can take more than one value so only the top 10 are taken

In [5]:
top_categories = 10
columns_to_encode = ['X3', 'X5', 'X6']
encoded_columns = pd.DataFrame()
for column in columns_to_encode:
    top_categories_series = data[column].value_counts().nlargest(top_categories)
    top_categories_list = top_categories_series.index.tolist()
    
    encoded = pd.get_dummies(data[column].apply(lambda x: x if x in top_categories_list else 'Other'), prefix=column)
    encoded_columns = pd.concat([encoded_columns, encoded], axis=1)
data_encoded = pd.concat([data.drop(columns=columns_to_encode), encoded_columns], axis=1)
print(data_encoded.head())

   X2      X16  X17  X18  X4_Female  X4_Male  X4_Others  X4_Prefer not to say  \
0  20  Nothing  Yes   No      False     True      False                 False   
1  69     nope  Yes  Yes      False     True      False                 False   
2  20       No   No  Yes      False     True      False                 False   
3  21      NaN  Yes  Yes      False     True      False                 False   
4  56      NaN   No   No       True    False      False                 False   

   X7_Everyday  X7_No  ...  X6_B.tech  X6_BTech   X6_BTech CSE  X6_Btech  \
0         True  False  ...      False      False         False     False   
1         True  False  ...      False      False         False     False   
2        False  False  ...      False      False         False     False   
3        False   True  ...      False      False         False     False   
4        False  False  ...      False      False         False     False   

   X6_CSE  X6_Computer Science  X6_Computer Science Engi

In [6]:
def merge_columns(row):
    if row['X18'] == 'Yes':
        return True
    elif row['X17'] == 'Yes':
        return True
    else: 
        return False

# Apply the function to create the new column
data_encoded['Output'] = data_encoded.apply(merge_columns, axis=1)
data_encoded.drop(['X18', 'X17'], axis=1, inplace=True)

# Display the DataFrame with the merged column
data_encoded.head()

Unnamed: 0,X2,X16,X4_Female,X4_Male,X4_Others,X4_Prefer not to say,X7_Everyday,X7_No,X7_Often,X7_Rarely,...,X6_BTech,X6_BTech CSE,X6_Btech,X6_CSE,X6_Computer Science,X6_Computer Science Engineering,X6_Computer science,X6_Computer science engineering,X6_Other,Output
0,20,Nothing,False,True,False,False,True,False,False,False,...,False,False,False,False,True,False,False,False,False,True
1,69,nope,False,True,False,False,True,False,False,False,...,False,False,False,False,True,False,False,False,False,True
2,20,No,False,True,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,True,True
3,21,,False,True,False,False,False,True,False,False,...,False,False,False,True,False,False,False,False,False,True
4,56,,True,False,False,False,False,False,False,True,...,False,False,False,False,False,False,False,False,True,False


In [7]:

data_encoded.to_csv('encoded_data.csv', index=False)
