<a href="https://colab.research.google.com/github/chaitra0312/ML-workshop-/blob/main/DataFrame.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Create a DataFrame and add a Bonus column

In [1]:
import pandas as pd
import numpy as np
data = {
    'Name': ['Alice', 'Bob', 'Carol'],
    'Age': [24, 27, 22],
    'Salary': [50000, 55000, 48000]
}
df1=pd.DataFrame(data)
df1['Bonus']=df1['Salary']*0.1
print(df1)

    Name  Age  Salary   Bonus
0  Alice   24   50000  5000.0
1    Bob   27   55000  5500.0
2  Carol   22   48000  4800.0


# Display first two rows and compute mean salary

In [3]:
df1.head(2)
print(df1)
mean=df1['Salary'].mean()
print(mean)

    Name  Age  Salary   Bonus
0  Alice   24   50000  5000.0
1    Bob   27   55000  5500.0
2  Carol   22   48000  4800.0
51000.0


# Filter rows based on Age and Salary

In [5]:
Age=df1[df1['Age']>25]
print(Age)
Salary=df1[df1['Salary']>5000]
print(Salary)

  Name  Age  Salary   Bonus
1  Bob   27   55000  5500.0
    Name  Age  Salary   Bonus
0  Alice   24   50000  5000.0
1    Bob   27   55000  5500.0
2  Carol   22   48000  4800.0


# Handle missing values

In [8]:
data_missing = {
    'A': [1, 2, None, 4],
    'B': [None, 2, 3, None],
    'C': [1, 2, 3, 4]
}
df2=pd.DataFrame(data_missing)
df2.fillna(0,inplace=True)
df2.dropna(inplace=True)
print(df2)

     A    B  C
0  1.0  0.0  1
1  2.0  2.0  2
2  0.0  3.0  3
3  4.0  0.0  4


# Grouping and Aggregation

In [12]:
data_employees = {
    'Department': ['HR', 'HR', 'IT', 'IT', 'Finance'],
    'Employee': ['Alice', 'Bob', 'Carol', 'David', 'Eve'],
    'Salary': [50000, 45000, 60000, 65000, 70000]
}
df3=pd.DataFrame(data_employees)
grouped=df3.groupby('Department')['Salary'].agg(['sum','mean'])
print(grouped)


               sum     mean
Department                 
Finance      70000  70000.0
HR           95000  47500.0
IT          125000  62500.0


# Perform an inner join

In [13]:
df4 = pd.DataFrame({'ID': [1, 2, 3], 'Name': ['Alice', 'Bob', 'Carol']})
df5 = pd.DataFrame({'ID': [2, 3, 4], 'Age': [25, 30, 22]})
join=pd.merge(df4,df5,on='ID',how='inner')
print(join)

   ID   Name  Age
0   2    Bob   25
1   3  Carol   30


# Clean dataset and display statistics

In [14]:
print(df5.describe())

        ID        Age
count  3.0   3.000000
mean   3.0  25.666667
std    1.0   4.041452
min    2.0  22.000000
25%    2.5  23.500000
50%    3.0  25.000000
75%    3.5  27.500000
max    4.0  30.000000


# EDA on Titanic datase,Load Titanic dataset

In [17]:
titanic=pd.read_csv('/content/Titanic-Dataset.csv')
print(titanic)

     PassengerId  Survived  Pclass  \
0              1         0       3   
1              2         1       1   
2              3         1       3   
3              4         1       1   
4              5         0       3   
..           ...       ...     ...   
886          887         0       2   
887          888         1       1   
888          889         0       3   
889          890         1       1   
890          891         0       3   

                                                  Name     Sex   Age  SibSp  \
0                              Braund, Mr. Owen Harris    male  22.0      1   
1    Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0      1   
2                               Heikkinen, Miss. Laina  female  26.0      0   
3         Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.0      1   
4                             Allen, Mr. William Henry    male  35.0      0   
..                                                 ...     ...   ... 

# Missing values

In [18]:
missing=titanic.isnull().sum()
print(missing)

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64


# Average age

In [20]:
age=titanic['Age'].mean()
print(age)

29.69911764705882


# Survival rate by gender

In [21]:
gender=titanic.groupby('Sex')['Survived'].mean()
print(gender)

Sex
female    0.742038
male      0.188908
Name: Survived, dtype: float64


# Data Transformation,Example temperature dataset

In [30]:
temp_data = {
    'Date': ['2024-12-20', '2024-12-21', '2024-12-22'],
    'Temperature': [15, 25, 5]
}
temp=pd.DataFrame(temp_data)

# Convert Date to DateTime

In [31]:
temp['Date']=pd.to_datetime(temp['Date'])
print(temp)

        Date  Temperature
0 2024-12-20           15
1 2024-12-21           25
2 2024-12-22            5


# Categorize temperatures

In [28]:
def categorize_temp(temp):
  if temp >= 20:
    return 'High'
  elif 10 <= temp < 20:
    return 'Medium'
  else:
    return 'Low'
temp['Catagory']=temp['Temperature'].apply(categorize_temp)
print(temp)

        Date  Temperature Catagory
0 2024-12-20           15   Medium
1 2024-12-21           25     High
2 2024-12-22            5      Low
