In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler, StandardScaler, LabelEncoder
from datetime import datetime

In [4]:
df = pd.read_csv('homeprices_data.csv')

Task 1: Identify missing values in the DataFrame

In [5]:
print("\nMissing values in DataFrame:\n", df.isnull().sum())


Missing values in DataFrame:
 area         1
bedrooms     0
bathrooms    2
balcony      3
price        0
dtype: int64


Task 2: Drop rows with any missing values

In [4]:
df_dropna_rows = df.dropna()
print("\nDataFrame after dropping rows with missing values:\n\n", df_dropna_rows)


DataFrame after dropping rows with missing values:

       area  bedrooms  bathrooms  balcony   price
0   1056.0         2        2.0      1.0   39.07
1   2600.0         4        3.0      3.0  120.00
3   1521.0         3        3.0      2.0   75.00
4   1521.0         3        3.0      2.0   75.00
5   1200.0         2        2.0      1.0   51.00
7   2732.0         4        2.0      3.0  135.00
8   3300.0         4        3.0      2.0  155.00
9   1310.0         3        2.0      2.0   50.00
10  3700.0         5        3.0      3.0  167.00
12  2785.0         4        2.0      3.0  140.00
14  1100.0         2        2.0      2.0   40.00
15  2250.0         3        2.0      3.0  101.00
16  2770.0         3        3.0      2.0  102.00
17  1175.0         2        1.0      2.0   42.00
18  1180.0         3        2.0      3.0   48.00
20  2770.0         3        3.0      2.0  102.00
21   800.0         1        1.0      1.0   32.00


Task 3: Drop columns with any missing values

In [5]:
df_dropna_columns = df.dropna(axis=1)
print("\nDataFrame after dropping columns with missing values:\n", df_dropna_columns)


DataFrame after dropping columns with missing values:
     bedrooms   price
0          2   39.07
1          4  120.00
2          3   62.00
3          3   75.00
4          3   75.00
5          2   51.00
6          2   38.00
7          4  135.00
8          4  155.00
9          3   50.00
10         5  167.00
11         3   82.00
12         4  140.00
13         2   38.00
14         2   40.00
15         3  101.00
16         3  102.00
17         2   42.00
18         3   48.00
19         3   60.00
20         3  102.00
21         1   32.00


Task 4: Fill missing values with a specific value (e.g., 0)

In [6]:
df_filled = df.fillna(df.mode().iloc[0])
print("\nDataFrame after filling missing values with 0:\n", df_filled)


DataFrame after filling missing values with 0:
       area  bedrooms  bathrooms  balcony   price
0   1056.0         2        2.0      1.0   39.07
1   2600.0         4        3.0      3.0  120.00
2   1440.0         3        2.0      2.0   62.00
3   1521.0         3        3.0      2.0   75.00
4   1521.0         3        3.0      2.0   75.00
5   1200.0         2        2.0      1.0   51.00
6   1170.0         2        2.0      2.0   38.00
7   2732.0         4        2.0      3.0  135.00
8   3300.0         4        3.0      2.0  155.00
9   1310.0         3        2.0      2.0   50.00
10  3700.0         5        3.0      3.0  167.00
11  1800.0         3        3.0      2.0   82.00
12  2785.0         4        2.0      3.0  140.00
13  1521.0         2        2.0      1.0   38.00
14  1100.0         2        2.0      2.0   40.00
15  2250.0         3        2.0      3.0  101.00
16  2770.0         3        3.0      2.0  102.00
17  1175.0         2        1.0      2.0   42.00
18  1180.0         3

Task 5: Fill missing values using forward fill

In [7]:
df_ffill = df.fillna(method='ffill')
print("\nDataFrame after forward fill:\n", df_ffill)


DataFrame after forward fill:
       area  bedrooms  bathrooms  balcony   price
0   1056.0         2        2.0      1.0   39.07
1   2600.0         4        3.0      3.0  120.00
2   1440.0         3        2.0      3.0   62.00
3   1521.0         3        3.0      2.0   75.00
4   1521.0         3        3.0      2.0   75.00
5   1200.0         2        2.0      1.0   51.00
6   1170.0         2        2.0      2.0   38.00
7   2732.0         4        2.0      3.0  135.00
8   3300.0         4        3.0      2.0  155.00
9   1310.0         3        2.0      2.0   50.00
10  3700.0         5        3.0      3.0  167.00
11  1800.0         3        3.0      3.0   82.00
12  2785.0         4        2.0      3.0  140.00
13  2785.0         2        2.0      1.0   38.00
14  1100.0         2        2.0      2.0   40.00
15  2250.0         3        2.0      3.0  101.00
16  2770.0         3        3.0      2.0  102.00
17  1175.0         2        1.0      2.0   42.00
18  1180.0         3        2.0      

  df_ffill = df.fillna(method='ffill')


Task 6: Fill missing values using backward fill

In [8]:
df

Unnamed: 0,area,bedrooms,bathrooms,balcony,price
0,1056.0,2,2.0,1.0,39.07
1,2600.0,4,3.0,3.0,120.0
2,1440.0,3,2.0,,62.0
3,1521.0,3,3.0,2.0,75.0
4,1521.0,3,3.0,2.0,75.0
5,1200.0,2,2.0,1.0,51.0
6,1170.0,2,,2.0,38.0
7,2732.0,4,2.0,3.0,135.0
8,3300.0,4,3.0,2.0,155.0
9,1310.0,3,2.0,2.0,50.0


In [9]:
df_bfill = df.fillna(method='bfill')
print("\nDataFrame after backward fill:\n", df_bfill)


DataFrame after backward fill:
       area  bedrooms  bathrooms  balcony   price
0   1056.0         2        2.0      1.0   39.07
1   2600.0         4        3.0      3.0  120.00
2   1440.0         3        2.0      2.0   62.00
3   1521.0         3        3.0      2.0   75.00
4   1521.0         3        3.0      2.0   75.00
5   1200.0         2        2.0      1.0   51.00
6   1170.0         2        2.0      2.0   38.00
7   2732.0         4        2.0      3.0  135.00
8   3300.0         4        3.0      2.0  155.00
9   1310.0         3        2.0      2.0   50.00
10  3700.0         5        3.0      3.0  167.00
11  1800.0         3        3.0      3.0   82.00
12  2785.0         4        2.0      3.0  140.00
13  1100.0         2        2.0      1.0   38.00
14  1100.0         2        2.0      2.0   40.00
15  2250.0         3        2.0      3.0  101.00
16  2770.0         3        3.0      2.0  102.00
17  1175.0         2        1.0      2.0   42.00
18  1180.0         3        2.0     

  df_bfill = df.fillna(method='bfill')


Task 7: Interpolate missing values

In [10]:
df_interpolated = df.interpolate()
print("\nDataFrame after interpolation:\n", df_interpolated)


DataFrame after interpolation:
       area  bedrooms  bathrooms  balcony   price
0   1056.0         2        2.0      1.0   39.07
1   2600.0         4        3.0      3.0  120.00
2   1440.0         3        2.0      2.5   62.00
3   1521.0         3        3.0      2.0   75.00
4   1521.0         3        3.0      2.0   75.00
5   1200.0         2        2.0      1.0   51.00
6   1170.0         2        2.0      2.0   38.00
7   2732.0         4        2.0      3.0  135.00
8   3300.0         4        3.0      2.0  155.00
9   1310.0         3        2.0      2.0   50.00
10  3700.0         5        3.0      3.0  167.00
11  1800.0         3        3.0      3.0   82.00
12  2785.0         4        2.0      3.0  140.00
13  1942.5         2        2.0      1.0   38.00
14  1100.0         2        2.0      2.0   40.00
15  2250.0         3        2.0      3.0  101.00
16  2770.0         3        3.0      2.0  102.00
17  1175.0         2        1.0      2.0   42.00
18  1180.0         3        2.0     

Task 8: Convert a column to a different data type (e.g., Area to float)

In [11]:
df['Area'] = df['area'].astype(float)
df['price'] = df['price'].astype(int)
print("\nDataFrame after converting 'Area' to float:\n", df.dtypes)


DataFrame after converting 'Area' to float:
 area         float64
bedrooms       int64
bathrooms    float64
balcony      float64
price          int64
Area         float64
dtype: object


Task 9: Apply a function to transform the values of a column (e.g., convert prices to thousands)

In [12]:
df['price'] = df['price'].apply(lambda x: x / 1000)
print("\nDataFrame after transforming 'Price' column:\n", df.head())


DataFrame after transforming 'Price' column:
      area  bedrooms  bathrooms  balcony  price    Area
0  1056.0         2        2.0      1.0  0.039  1056.0
1  2600.0         4        3.0      3.0  0.120  2600.0
2  1440.0         3        2.0      NaN  0.062  1440.0
3  1521.0         3        3.0      2.0  0.075  1521.0
4  1521.0         3        3.0      2.0  0.075  1521.0


Task 10: Normalize a column using Min-Max scaling (Price)

In [13]:
scaler = MinMaxScaler()
df['Normalized_Pr'] = scaler.fit_transform(df[['price']])
print("\nDataFrame after Min-Max normalization on 'Price':\n", df.head())


DataFrame after Min-Max normalization on 'Price':
      area  bedrooms  bathrooms  balcony  price    Area  Normalized_Pr
0  1056.0         2        2.0      1.0  0.039  1056.0       0.051852
1  2600.0         4        3.0      3.0  0.120  2600.0       0.651852
2  1440.0         3        2.0      NaN  0.062  1440.0       0.222222
3  1521.0         3        3.0      2.0  0.075  1521.0       0.318519
4  1521.0         3        3.0      2.0  0.075  1521.0       0.318519


Task 11: Standardize a column (z-score normalization on 'Price')

In [14]:
scaler = StandardScaler()
df['Standardized_Pr'] = scaler.fit_transform(df[['price']])
print("\nDataFrame after standardizing 'Price':\n", df.head())


DataFrame after standardizing 'Price':
      area  bedrooms  bathrooms  balcony  price    Area  Normalized_Pr  \
0  1056.0         2        2.0      1.0  0.039  1056.0       0.051852   
1  2600.0         4        3.0      3.0  0.120  2600.0       0.651852   
2  1440.0         3        2.0      NaN  0.062  1440.0       0.222222   
3  1521.0         3        3.0      2.0  0.075  1521.0       0.318519   
4  1521.0         3        3.0      2.0  0.075  1521.0       0.318519   

   Standardized_Pr  
0        -0.994096  
1         0.983001  
2        -0.432698  
3        -0.115386  
4        -0.115386  


Task 12: Identify duplicate rows

In [15]:
print("\nDuplicate rows in DataFrame:\n", df.duplicated().sum())


Duplicate rows in DataFrame:
 2


Task 13: Drop duplicate rows

In [16]:
df_no_duplicates = df.drop_duplicates()
print("\nDataFrame after removing duplicate rows:\n", df_no_duplicates)


DataFrame after removing duplicate rows:
       area  bedrooms  bathrooms  balcony  price    Area  Normalized_Pr  \
0   1056.0         2        2.0      1.0  0.039  1056.0       0.051852   
1   2600.0         4        3.0      3.0  0.120  2600.0       0.651852   
2   1440.0         3        2.0      NaN  0.062  1440.0       0.222222   
3   1521.0         3        3.0      2.0  0.075  1521.0       0.318519   
5   1200.0         2        2.0      1.0  0.051  1200.0       0.140741   
6   1170.0         2        NaN      2.0  0.038  1170.0       0.044444   
7   2732.0         4        2.0      3.0  0.135  2732.0       0.762963   
8   3300.0         4        3.0      2.0  0.155  3300.0       0.911111   
9   1310.0         3        2.0      2.0  0.050  1310.0       0.133333   
10  3700.0         5        3.0      3.0  0.167  3700.0       1.000000   
11  1800.0         3        3.0      NaN  0.082  1800.0       0.370370   
12  2785.0         4        2.0      3.0  0.140  2785.0       0.80000

Task 14: Drop duplicate rows based on specific columns (e.g., Area and Price)

In [21]:
df_no_duplicates_subset = df.drop_duplicates(subset=['area', 'price'])
print("\nDataFrame after dropping duplicates based on 'Area' and 'Price':\n", df_no_duplicates_subset)


DataFrame after dropping duplicates based on 'Area' and 'Price':
       area  bedrooms  bathrooms  balcony  price  Normalized_Pr  \
0   1056.0         2        2.0      1.0  0.039       0.051852   
1   2600.0         4        3.0      3.0  0.120       0.651852   
2   1440.0         3        2.0      NaN  0.062       0.222222   
3   1521.0         3        3.0      2.0  0.075       0.318519   
5   1200.0         2        2.0      1.0  0.051       0.140741   
6   1170.0         2        NaN      2.0  0.038       0.044444   
7   2732.0         4        2.0      3.0  0.135       0.762963   
8   3300.0         4        3.0      2.0  0.155       0.911111   
9   1310.0         3        2.0      2.0  0.050       0.133333   
10  3700.0         5        3.0      3.0  0.167       1.000000   
11  1800.0         3        3.0      NaN  0.082       0.370370   
12  2785.0         4        2.0      3.0  0.140       0.800000   
13     NaN         2        NaN      1.0  0.038       0.044444   
14  1100.

Task 15: Convert all string values in a column to lowercase (if applicable)

In [23]:
df['balcony'] = df['balcony'].astype(str).str.lower()
print("\nDataFrame after converting 'Balcony' to lowercase:\n", df.head())


DataFrame after converting 'Balcony' to lowercase:
      area  bedrooms  bathrooms balcony  price  Normalized_Pr  Standardized_Pr
0  1056.0         2        2.0     1.0  0.039       0.051852        -0.994096
1  2600.0         4        3.0     3.0  0.120       0.651852         0.983001
2  1440.0         3        2.0     nan  0.062       0.222222        -0.432698
3  1521.0         3        3.0     2.0  0.075       0.318519        -0.115386
4  1521.0         3        3.0     2.0  0.075       0.318519        -0.115386


Task 16: Remove leading and trailing spaces from string values in a column (e.g., Balcony)

In [24]:
df['balcony'] = df['balcony'].str.strip()
print("\nDataFrame after stripping spaces in 'Balcony':\n", df.head())


DataFrame after stripping spaces in 'Balcony':
      area  bedrooms  bathrooms balcony  price  Normalized_Pr  Standardized_Pr
0  1056.0         2        2.0     1.0  0.039       0.051852        -0.994096
1  2600.0         4        3.0     3.0  0.120       0.651852         0.983001
2  1440.0         3        2.0     nan  0.062       0.222222        -0.432698
3  1521.0         3        3.0     2.0  0.075       0.318519        -0.115386
4  1521.0         3        3.0     2.0  0.075       0.318519        -0.115386


Task 17: Replace a specific substring in a column (e.g., replace 'nan' with 'NAN' in Balcony column)

In [26]:
df['balcony'] = df['balcony'].str.replace('nan', 'NAN', regex=True)
print("\nDataFrame after replacing 'nan' with 'NAN':\n", df.head())


DataFrame after replacing 'nan' with 'NAN':
      area  bedrooms  bathrooms balcony  price  Normalized_Pr  Standardized_Pr
0  1056.0         2        2.0     1.0  0.039       0.051852        -0.994096
1  2600.0         4        3.0     3.0  0.120       0.651852         0.983001
2  1440.0         3        2.0     NAN  0.062       0.222222        -0.432698
3  1521.0         3        3.0     2.0  0.075       0.318519        -0.115386
4  1521.0         3        3.0     2.0  0.075       0.318519        -0.115386


Task 18: Extract a substring from each value in a column (if applicable)

In [28]:
df['balcony_Sub'] = df['balcony'].str[:1]  # Extract first 3 characters
print("\nExtracted substring from 'Balcony':\n", df)


Extracted substring from 'Balcony':
       area  bedrooms  bathrooms balcony  price  Normalized_Pr  \
0   1056.0         2        2.0     1.0  0.039       0.051852   
1   2600.0         4        3.0     3.0  0.120       0.651852   
2   1440.0         3        2.0     NAN  0.062       0.222222   
3   1521.0         3        3.0     2.0  0.075       0.318519   
4   1521.0         3        3.0     2.0  0.075       0.318519   
5   1200.0         2        2.0     1.0  0.051       0.140741   
6   1170.0         2        NaN     2.0  0.038       0.044444   
7   2732.0         4        2.0     3.0  0.135       0.762963   
8   3300.0         4        3.0     2.0  0.155       0.911111   
9   1310.0         3        2.0     2.0  0.050       0.133333   
10  3700.0         5        3.0     3.0  0.167       1.000000   
11  1800.0         3        3.0     NAN  0.082       0.370370   
12  2785.0         4        2.0     3.0  0.140       0.800000   
13     NaN         2        NaN     1.0  0.038      

In [29]:
df =  df.drop(columns= 'balcony_Sub')
df.head()

Unnamed: 0,area,bedrooms,bathrooms,balcony,price,Normalized_Pr,Standardized_Pr
0,1056.0,2,2.0,1.0,0.039,0.051852,-0.994096
1,2600.0,4,3.0,3.0,0.12,0.651852,0.983001
2,1440.0,3,2.0,NAN,0.062,0.222222,-0.432698
3,1521.0,3,3.0,2.0,0.075,0.318519,-0.115386
4,1521.0,3,3.0,2.0,0.075,0.318519,-0.115386


In [49]:
df = pd.read_csv('Microsoft_Stock.csv')

Extract column names

In [41]:
column_names = df.columns.tolist()
print("Column Names:", column_names)

Column Names: ['Date', 'Open', 'High', 'Low', 'Close', 'Volume']


Task 19: Convert a column to datetime format (if 'Date' column exists)

In [42]:
if 'Date' in df.columns:
    df['Date'] = pd.to_datetime(df['Date'])
    print("\nDataFrame after converting 'Date' to datetime format:\n", df.head())


DataFrame after converting 'Date' to datetime format:
                  Date   Open   High    Low  Close    Volume
0 2015-04-01 16:00:00  40.60  40.76  40.31  40.72  36865322
1 2015-04-02 16:00:00  40.66  40.74  40.12  40.29  37487476
2 2015-04-06 16:00:00  40.34  41.78  40.18  41.55  39223692
3 2015-04-07 16:00:00  41.61  41.91  41.31  41.53  28809375
4 2015-04-08 16:00:00  41.48  41.69  41.04  41.42  24753438


Task 20: Extract year, month, and day from a datetime column (if 'Date' column exists)

In [45]:
if 'Date' in df.columns:
    df['Year'] = df['Date'].dt.year
    df['Month'] = df['Date'].dt.month
    df['Day'] = df['Date'].dt.day
    print("\nExtracted Year, Month, and Day from 'Date':\n", df.head())


Extracted Year, Month, and Day from 'Date':
                  Date   Open   High    Low  Close    Volume  Year  Month  Day
0 2015-04-01 16:00:00  40.60  40.76  40.31  40.72  36865322  2015      4    1
1 2015-04-02 16:00:00  40.66  40.74  40.12  40.29  37487476  2015      4    2
2 2015-04-06 16:00:00  40.34  41.78  40.18  41.55  39223692  2015      4    6
3 2015-04-07 16:00:00  41.61  41.91  41.31  41.53  28809375  2015      4    7
4 2015-04-08 16:00:00  41.48  41.69  41.04  41.42  24753438  2015      4    8


Task 21: Filter rows based on a date range (if 'Date' column exists)

In [48]:
if 'Date' in df.columns:
    df_date_filtered = df[(df['Date'] >= '2015-04-21') & (df['Date'] <= '2015-05-02')]
    print("\nDataFrame filtered by date range:\n", df_date_filtered)


DataFrame filtered by date range:
                   Date   Open   High    Low  Close     Volume  Year  Month  \
13 2015-04-21 16:00:00  43.00  43.15  42.53  42.64   26013844  2015      4   
14 2015-04-22 16:00:00  42.67  43.13  42.55  42.99   25064273  2015      4   
15 2015-04-23 16:00:00  42.85  43.61  42.80  43.34   46309530  2015      4   
16 2015-04-24 16:00:00  45.66  48.14  45.65  47.87  130933665  2015      4   
17 2015-04-27 16:00:00  47.23  48.13  47.22  48.03   59248172  2015      4   
18 2015-04-28 16:00:00  47.78  49.21  47.70  49.16   60730778  2015      4   
19 2015-04-29 16:00:00  48.72  49.31  48.50  49.06   47804562  2015      4   
20 2015-04-30 16:00:00  48.70  49.54  48.60  48.64   64725457  2015      4   
21 2015-05-01 16:00:00  48.58  48.88  48.40  48.66   38937336  2015      5   

    Day  
13   21  
14   22  
15   23  
16   24  
17   27  
18   28  
19   29  
20   30  
21    1  


In [65]:
df = pd.read_csv('categorical_data.csv')
column_names = df.columns.tolist()
print("Column Names:", column_names)

Column Names: ['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked']


Task 22: Convert a categorical column to numerical using one-hot encoding (e.g., Balcony)

In [67]:
df_one_hot = pd.get_dummies(df, columns=['Sex'])
sex_cols = [col for col in df_one_hot.columns if col.startswith('Sex_')]
df_one_hot[sex_cols] = df_one_hot[sex_cols].astype(int)
print("\nDataFrame after one-hot encoding 'Sex':\n", df_one_hot[['PassengerId', 'Survived', 'Age', 'Sex_female', 'Sex_male']].head(10))



DataFrame after one-hot encoding 'Sex':
    PassengerId  Survived   Age  Sex_female  Sex_male
0            1         0  22.0           0         1
1            2         1  38.0           1         0
2            3         1  26.0           1         0
3            4         1  35.0           1         0
4            5         0  35.0           0         1
5            6         0   NaN           0         1
6            7         0  54.0           0         1
7            8         0   2.0           0         1
8            9         1  27.0           1         0
9           10         1  14.0           1         0


Task 23: Convert a categorical column to numerical using label encoding (e.g., Balcony)

In [68]:
encoder = LabelEncoder()
df['Sex_Label'] = encoder.fit_transform(df['Sex'])
print("\nDataFrame after label encoding 'Sex':\n", df[['PassengerId', 'Survived', 'Age', 'Sex_Label']].head())


DataFrame after label encoding 'Sex':
    PassengerId  Survived   Age  Sex_Label
0            1         0  22.0          1
1            2         1  38.0          0
2            3         1  26.0          0
3            4         1  35.0          0
4            5         0  35.0          1


Task 24: Group values in a categorical column and create a new column with grouped categories

In [7]:
df = pd.read_csv('homeprices_data.csv')
column_names = df.columns.tolist()
print("Column Names:", column_names)

Column Names: ['area', 'bedrooms', 'bathrooms', 'balcony', 'price']


In [11]:
df['Bedrooms_Group'] = pd.cut(df['bedrooms'], bins=[0, 2, 3, 5], labels=['Small', 'Medium', 'Large'])
print("\nDataFrame with grouped 'Bedrooms':\n", df.head(18))


DataFrame with grouped 'Bedrooms':
       area  bedrooms  bathrooms  balcony   price Bedrooms_Group
0   1056.0         2        2.0      1.0   39.07          Small
1   2600.0         4        3.0      3.0  120.00          Large
2   1440.0         3        2.0      NaN   62.00         Medium
3   1521.0         3        3.0      2.0   75.00         Medium
4   1521.0         3        3.0      2.0   75.00         Medium
5   1200.0         2        2.0      1.0   51.00          Small
6   1170.0         2        NaN      2.0   38.00          Small
7   2732.0         4        2.0      3.0  135.00          Large
8   3300.0         4        3.0      2.0  155.00          Large
9   1310.0         3        2.0      2.0   50.00         Medium
10  3700.0         5        3.0      3.0  167.00          Large
11  1800.0         3        3.0      NaN   82.00         Medium
12  2785.0         4        2.0      3.0  140.00          Large
13     NaN         2        NaN      1.0   38.00          Small
14 

Task 25: Merge two DataFrames based on a common column

In [17]:
df2 = pd.DataFrame({'area': [1200, 2600, 3300], 'Nearby Schools': [3, 5, 2]})
df_merged = pd.merge(df,df2, on='area', how='left')
print("\nMerged DataFrame:\n", df_merged)


Merged DataFrame:
       area  bedrooms  bathrooms  balcony   price Bedrooms_Group  \
0   1056.0         2        2.0      1.0   39.07          Small   
1   2600.0         4        3.0      3.0  120.00          Large   
2   1440.0         3        2.0      NaN   62.00         Medium   
3   1521.0         3        3.0      2.0   75.00         Medium   
4   1521.0         3        3.0      2.0   75.00         Medium   
5   1200.0         2        2.0      1.0   51.00          Small   
6   1170.0         2        NaN      2.0   38.00          Small   
7   2732.0         4        2.0      3.0  135.00          Large   
8   3300.0         4        3.0      2.0  155.00          Large   
9   1310.0         3        2.0      2.0   50.00         Medium   
10  3700.0         5        3.0      3.0  167.00          Large   
11  1800.0         3        3.0      NaN   82.00         Medium   
12  2785.0         4        2.0      3.0  140.00          Large   
13     NaN         2        NaN      1.0  

Task 26: Concatenate two DataFrames vertically

In [18]:
df_concat_vertical = pd.concat([df, df2], axis=0, ignore_index=True)
print("\nDataFrame after vertical concatenation:\n", df_concat_vertical)


DataFrame after vertical concatenation:
       area  bedrooms  bathrooms  balcony   price Bedrooms_Group  \
0   1056.0       2.0        2.0      1.0   39.07          Small   
1   2600.0       4.0        3.0      3.0  120.00          Large   
2   1440.0       3.0        2.0      NaN   62.00         Medium   
3   1521.0       3.0        3.0      2.0   75.00         Medium   
4   1521.0       3.0        3.0      2.0   75.00         Medium   
5   1200.0       2.0        2.0      1.0   51.00          Small   
6   1170.0       2.0        NaN      2.0   38.00          Small   
7   2732.0       4.0        2.0      3.0  135.00          Large   
8   3300.0       4.0        3.0      2.0  155.00          Large   
9   1310.0       3.0        2.0      2.0   50.00         Medium   
10  3700.0       5.0        3.0      3.0  167.00          Large   
11  1800.0       3.0        3.0      NaN   82.00         Medium   
12  2785.0       4.0        2.0      3.0  140.00          Large   
13     NaN       2.0

Task 27: Concatenate two DataFrames horizontally

In [19]:
df_concat_horizontal = pd.concat([df, df2], axis=1)
print("\nDataFrame after horizontal concatenation:\n", df_concat_horizontal)


DataFrame after horizontal concatenation:
       area  bedrooms  bathrooms  balcony   price Bedrooms_Group    area  \
0   1056.0         2        2.0      1.0   39.07          Small  1200.0   
1   2600.0         4        3.0      3.0  120.00          Large  2600.0   
2   1440.0         3        2.0      NaN   62.00         Medium  3300.0   
3   1521.0         3        3.0      2.0   75.00         Medium     NaN   
4   1521.0         3        3.0      2.0   75.00         Medium     NaN   
5   1200.0         2        2.0      1.0   51.00          Small     NaN   
6   1170.0         2        NaN      2.0   38.00          Small     NaN   
7   2732.0         4        2.0      3.0  135.00          Large     NaN   
8   3300.0         4        3.0      2.0  155.00          Large     NaN   
9   1310.0         3        2.0      2.0   50.00         Medium     NaN   
10  3700.0         5        3.0      3.0  167.00          Large     NaN   
11  1800.0         3        3.0      NaN   82.00        

Task 28: Create a new column based on existing columns (e.g., total rooms = bedrooms + bathrooms)

In [23]:
df['Total_Rooms'] = df['bedrooms'] + df['bathrooms']
print("\nDataFrame with 'Total_Rooms' column:\n", df.head())


DataFrame with 'Total_Rooms' column:
      area  bedrooms  bathrooms  balcony   price Bedrooms_Group  Total_Rooms
0  1056.0         2        2.0      1.0   39.07          Small          4.0
1  2600.0         4        3.0      3.0  120.00          Large          7.0
2  1440.0         3        2.0      NaN   62.00         Medium          5.0
3  1521.0         3        3.0      2.0   75.00         Medium          6.0
4  1521.0         3        3.0      2.0   75.00         Medium          6.0


Task 29: Discretize a continuous column into bins (e.g., price into categories)

In [24]:
df['Price_Category'] = pd.cut(df['price'], bins=[0, 50, 100, 150], labels=['Low', 'Medium', 'High'])
print("\nDataFrame with discretized 'Price' column:\n", df.head())


DataFrame with discretized 'Price' column:
      area  bedrooms  bathrooms  balcony   price Bedrooms_Group  Total_Rooms  \
0  1056.0         2        2.0      1.0   39.07          Small          4.0   
1  2600.0         4        3.0      3.0  120.00          Large          7.0   
2  1440.0         3        2.0      NaN   62.00         Medium          5.0   
3  1521.0         3        3.0      2.0   75.00         Medium          6.0   
4  1521.0         3        3.0      2.0   75.00         Medium          6.0   

  Price_Category  
0            Low  
1           High  
2         Medium  
3         Medium  
4         Medium  


Task 30: Create polynomial features from existing numerical columns (e.g., 'Area' and 'Bedrooms')

In [31]:
df['Bedrooms_Squared'] = df['bedrooms'] ** 2
df['Bathrooms_Squared'] = df['bathrooms'] ** 2
print("\nDataFrame with polynomial features:\n", df.head())


DataFrame with polynomial features:
      area  bedrooms  bathrooms  balcony   price Bedrooms_Group  Total_Rooms  \
0  1056.0         2        2.0      1.0   39.07          Small          4.0   
1  2600.0         4        3.0      3.0  120.00          Large          7.0   
2  1440.0         3        2.0      NaN   62.00         Medium          5.0   
3  1521.0         3        3.0      2.0   75.00         Medium          6.0   
4  1521.0         3        3.0      2.0   75.00         Medium          6.0   

  Price_Category  Area_Squared  Bedrooms_Squared  Bathrooms_Squared  
0            Low     1115136.0                 4                4.0  
1           High     6760000.0                16                9.0  
2         Medium     2073600.0                 9                4.0  
3         Medium     2313441.0                 9                9.0  
4         Medium     2313441.0                 9                9.0  
