# a) Creation and Loading of Different Types of Datasets

## i. Creation using pandas

In [28]:
import pandas as pd

data = {
'Name': ['Alice', 'Bob', 'Charlie', 'David'], 
'City': ['Delhi', 'Mumbai', 'Bangalore', 'Chennai'],
'Age': [25, 30, 22, 28],
}
df = pd.DataFrame(data) 
print("Created DataFrame:\n", df)

Created DataFrame:
       Name       City  Age
0    Alice      Delhi   25
1      Bob     Mumbai   30
2  Charlie  Bangalore   22
3    David    Chennai   28


## ii. Loading CSV Dataset using pandas

In [29]:
df.to_csv("students.csv", index=False)
df_csv = pd.read_csv("students.csv") 
print("Loaded CSV DataFrame:\n", df_csv)

Loaded CSV DataFrame:
       Name       City  Age
0    Alice      Delhi   25
1      Bob     Mumbai   30
2  Charlie  Bangalore   22
3    David    Chennai   28


## iii. Loading datasets using sklearn

In [30]:
from sklearn import datasets 
import pandas as pd

iris = datasets.load_iris()

iris_df = pd.DataFrame(data=iris.data, columns=iris.feature_names)

iris_df['target'] = iris.target

print("Iris Dataset Head:\n", iris_df.head())


Iris Dataset Head:
    sepal length (cm)  sepal width (cm)  petal length (cm)  petal width (cm)  \
0                5.1               3.5                1.4               0.2   
1                4.9               3.0                1.4               0.2   
2                4.7               3.2                1.3               0.2   
3                4.6               3.1                1.5               0.2   
4                5.0               3.6                1.4               0.2   

   target  
0       0  
1       0  
2       0  
3       0  
4       0  


# b) Compute Mean, Median, Mode, Variance, Standard Deviation

In [31]:
import numpy as np 
from scipy import stats

sample_data = [10, 20, 30, 40, 50, 50]

print("Mean:", np.mean(sample_data))
print("Median:", np.median(sample_data))

mode_result = stats.mode(sample_data, keepdims=True)
print("Mode:", mode_result.mode[0])

print("Variance:", np.var(sample_data))
print("Standard Deviation:", np.std(sample_data))


Mean: 33.333333333333336
Median: 35.0
Mode: 50
Variance: 222.22222222222226
Standard Deviation: 14.9071198499986


# c) Data Preprocessing Techniques

## i. Reshaping the data

In [32]:
import numpy as np
arr=np.array([[1, 2, 3], [4, 5, 6]])
reshaped = arr.reshape(3, 2)
print("Reshaped Data:\n", reshaped)

Reshaped Data:
 [[1 2]
 [3 4]
 [5 6]]


## ii.Filtering the data

In [33]:
filtered_df=df[df['Age'] > 25]
print("Filtered Data:\n", filtered_df)

Filtered Data:
     Name     City  Age
1    Bob   Mumbai   30
3  David  Chennai   28


## iii. Merging the data

In [34]:
df_extra= pd.DataFrame({ 'Name': ['Alice', 'Bob'], 'Score': [85, 90]
})
merged_df = pd.merge(df, df_extra, on='Name')
print("Merged DataFrame:\n", merged_df)

Merged DataFrame:
     Name    City  Age  Score
0  Alice   Delhi   25     85
1    Bob  Mumbai   30     90


## iv. Handling Missing Values

In [35]:
import pandas as pd

data_missing = {
    'Name': ['Alice', 'Bob', 'Charlie'],
    'Marks': [85, None, 78]
}

df_missing = pd.DataFrame(data_missing)

df_missing['Marks'].fillna(df_missing['Marks'].mean(), inplace=True)

print("Missing Values Handled:\n", df_missing)


Missing Values Handled:
       Name  Marks
0    Alice   85.0
1      Bob   81.5
2  Charlie   78.0


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_missing['Marks'].fillna(df_missing['Marks'].mean(), inplace=True)


## v. Feature Normalization: Min-Max Normalization

In [36]:
from sklearn.preprocessing import MinMaxScaler
data = pd.DataFrame({'Marks': [50, 60, 70, 80, 90]}) 
scaler = MinMaxScaler()
data['Normalized'] = scaler.fit_transform(data[['Marks']]) 
print("Min-Max Normalization:\n", data)

Min-Max Normalization:
    Marks  Normalized
0     50        0.00
1     60        0.25
2     70        0.50
3     80        0.75
4     90        1.00
