In [1]:
import pandas as pd
import numpy as np

In [3]:
# Creating a DataFrame
# Column-oriented source data: each keyword becomes a column name and each
# list holds that column's values (five entries per column, one per person).
data = dict(
    Name=['Alice', 'Bob', 'Charlie', 'David', 'Eva'],
    Age=[25, 30, 35, 28, 22],
    City=['New York', 'San Francisco', 'Los Angeles', 'Chicago', 'Boston'],
    Salary=[50000, 70000, 80000, 65000, 45000],
)

In [5]:
# Echo the raw dict: as the cell's last bare expression, its repr is shown as the cell output.
data

{'Name': ['Alice', 'Bob', 'Charlie', 'David', 'Eva'],
 'Age': [25, 30, 35, 28, 22],
 'City': ['New York', 'San Francisco', 'Los Angeles', 'Chicago', 'Boston'],
 'Salary': [50000, 70000, 80000, 65000, 45000]}

In [7]:
# Wrap the column dict in a DataFrame: keys become column labels and the
# rows receive a default 0..n-1 integer index.
df = pd.DataFrame(data=data)
print("Original DataFrame:", df, sep="\n")

Original DataFrame:
      Name  Age           City  Salary
0    Alice   25       New York   50000
1      Bob   30  San Francisco   70000
2  Charlie   35    Los Angeles   80000
3    David   28        Chicago   65000
4      Eva   22         Boston   45000


In [34]:
# Structural overview of the frame: index range, per-column non-null counts,
# dtypes and memory usage.
print("\nDataFrame Info:")
# info() writes its report itself (buf=None means standard output) and returns
# None, so it is called on its own rather than wrapped in print().
df.info(buf=None)


DataFrame Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5 entries, 0 to 4
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Name    5 non-null      object
 1   Age     5 non-null      int64 
 2   City    5 non-null      object
 3   Salary  5 non-null      int64 
dtypes: int64(2), object(2)
memory usage: 292.0+ bytes


In [11]:
# Descriptive statistics (count, mean, std, min, quartiles, max).
# describe() only reports on the numeric columns by default.
print("\nSummary Statistics:", df.describe(), sep="\n")


Summary Statistics:
             Age        Salary
count   5.000000      5.000000
mean   28.000000  62000.000000
std     4.949747  14404.860291
min    22.000000  45000.000000
25%    25.000000  50000.000000
50%    28.000000  65000.000000
75%    30.000000  70000.000000
max    35.000000  80000.000000


In [9]:
# Pull a single column out of the frame; the result is a Series that keeps
# the frame's row index and carries the column label as its name.
print("\nAge column:", df.loc[:, 'Age'], sep="\n")


Age column:
0    25
1    30
2    35
3    28
4    22
Name: Age, dtype: int64


In [13]:
# Attach a new 'Experience' column. The values are given in row order, so
# they are wrapped in a Series that shares df's index to make the pairing
# between value and row explicit.
df['Experience'] = pd.Series([3, 7, 12, 5, 1], index=df.index)
print("\nDataFrame with new 'Experience' column:", df, sep="\n")


DataFrame with new 'Experience' column:
      Name  Age           City  Salary  Experience
0    Alice   25       New York   50000           3
1      Bob   30  San Francisco   70000           7
2  Charlie   35    Los Angeles   80000          12
3    David   28        Chicago   65000           5
4      Eva   22         Boston   45000           1


In [15]:
# Boolean-mask row filter: keep only the rows whose Age is strictly greater
# than 28 (a row with Age == 28 is excluded).
print("\nEmployees older than 28:", df.loc[df['Age'].gt(28)], sep="\n")


Employees older than 28:
      Name  Age           City  Salary  Experience
1      Bob   30  San Francisco   70000           7
2  Charlie   35    Los Angeles   80000          12


In [19]:
# Order rows from highest to lowest Salary. sort_values returns a new frame;
# df itself keeps its original row order.
print(
    "\nDataFrame sorted by Salary (descending):",
    df.sort_values(by='Salary', ascending=False),
    sep="\n",
)


DataFrame sorted by Salary (descending):
      Name  Age           City  Salary  Experience
2  Charlie   35    Los Angeles   80000          12
1      Bob   30  San Francisco   70000           7
3    David   28        Chicago   65000           5
0    Alice   25       New York   50000           3
4      Eva   22         Boston   45000           1


In [11]:
# Split rows by City, then average Salary within each group. The result is
# a Series indexed by city name.
print(
    "\nAverage salary by city:",
    df.groupby('City')['Salary'].agg('mean'),
    sep="\n",
)


Average salary by city:
City
Boston           45000.0
Chicago          65000.0
Los Angeles      80000.0
New York         50000.0
San Francisco    70000.0
Name: Salary, dtype: float64


In [21]:
# Deriving a column from an existing one
# Take-home pay is modelled as a flat 80% of gross salary. Plain vectorized
# arithmetic replaces Series.apply(lambda ...): it produces the same float64
# values without invoking a Python function once per row.
df['Salary_After_Tax'] = df['Salary'] * 0.8
print("\nDataFrame with new 'Salary_After_Tax' column:")
print(df)


DataFrame with new 'Salary_After_Tax' column:
      Name  Age           City  Salary  Experience  Salary_After_Tax
0    Alice   25       New York   50000           3           40000.0
1      Bob   30  San Francisco   70000           7           56000.0
2  Charlie   35    Los Angeles   80000          12           64000.0
3    David   28        Chicago   65000           5           52000.0
4      Eva   22         Boston   45000           1           36000.0


In [23]:
# Handling missing data
# NaN is a float, so it cannot be stored in an int64 column. Cast 'Age' to
# float64 explicitly before inserting it: recent pandas releases warn about
# (and are moving to reject) assignments that silently upcast a column's dtype.
# The cast is a no-op if the column is already float64, so re-running is safe.
df['Age'] = df['Age'].astype('float64')
# Blank out the Age of the row labelled 2 to simulate a missing observation.
df.loc[2, 'Age'] = np.nan
print("\nDataFrame with a missing value:")
print(df)




DataFrame with a missing value:
      Name   Age           City  Salary  Experience  Salary_After_Tax
0    Alice  25.0       New York   50000           3           40000.0
1      Bob  30.0  San Francisco   70000           7           56000.0
2  Charlie   NaN    Los Angeles   80000          12           64000.0
3    David  28.0        Chicago   65000           5           52000.0
4      Eva  22.0         Boston   45000           1           36000.0


In [25]:
# Discard every row that contains at least one missing value. dropna returns
# a new frame; df itself is left untouched.
print(
    "\nDropping rows with missing values:",
    df.dropna(axis=0, how='any'),
    sep="\n",
)



Dropping rows with missing values:
    Name   Age           City  Salary  Experience  Salary_After_Tax
0  Alice  25.0       New York   50000           3           40000.0
1    Bob  30.0  San Francisco   70000           7           56000.0
3  David  28.0        Chicago   65000           5           52000.0
4    Eva  22.0         Boston   45000           1           36000.0


In [16]:
# Relabel the 'Salary' column as 'Annual_Salary'. Only exact label matches
# are renamed, so 'Salary_After_Tax' keeps its name. df is rebound to the
# renamed frame, so later cells must use the new label.
df = df.rename({'Salary': 'Annual_Salary'}, axis='columns')
print("\nDataFrame with renamed 'Salary' column:", df, sep="\n")


DataFrame with renamed 'Salary' column:
      Name   Age           City  Annual_Salary  Experience  Salary_After_Tax
0    Alice  25.0       New York          50000           3           40000.0
1      Bob  30.0  San Francisco          70000           7           56000.0
2  Charlie   NaN    Los Angeles          80000          12           64000.0
3    David  28.0        Chicago          65000           5           52000.0
4      Eva  22.0         Boston          45000           1           36000.0


In [18]:
# Project the frame onto a subset of columns. Passing a list of labels
# yields a DataFrame (not a Series) with the columns in the listed order.
print(
    "\nSelecting 'Name' and 'Age' columns:",
    df.loc[:, ['Name', 'Age']],
    sep="\n",
)


Selecting 'Name' and 'Age' columns:
      Name   Age
0    Alice  25.0
1      Bob  30.0
2  Charlie   NaN
3    David  28.0
4      Eva  22.0


In [20]:
# Label-based selection with loc: a boolean row mask combined with a list of
# column labels. Comparisons against NaN evaluate to False, so a row whose
# Age is missing never passes the filter.
print(
    "\nUsing loc to select rows where Age > 28:",
    df.loc[df['Age'].gt(28), ['Name', 'Age', 'City']],
    sep="\n",
)


Using loc to select rows where Age > 28:
  Name   Age           City
1  Bob  30.0  San Francisco


In [22]:
# Position-based selection with iloc. Slices are end-exclusive, so 0:3 picks
# row positions 0, 1, 2 and 1:4 picks column positions 1, 2, 3.
print(
    "\nUsing iloc to select the first 3 rows and columns 1-3:",
    df.iloc[0:3, 1:4],
    sep="\n",
)


Using iloc to select the first 3 rows and columns 1-3:
    Age           City  Annual_Salary
0  25.0       New York          50000
1  30.0  San Francisco          70000
2   NaN    Los Angeles          80000


In [27]:
# Rebuild a fresh 0..n-1 integer index on a copy of the frame. drop=True
# discards the old index instead of inserting it as a new column.
df_reset = df.reset_index(drop=True)
print("\nDataFrame with reset index:", df_reset, sep="\n")


DataFrame with reset index:
      Name   Age           City  Annual_Salary  Experience  Salary_After_Tax
0    Alice  25.0       New York          50000           3           40000.0
1      Bob  30.0  San Francisco          70000           7           56000.0
2  Charlie   NaN    Los Angeles          80000          12           64000.0
3    David  28.0        Chicago          65000           5           52000.0
4      Eva  22.0         Boston          45000           1           36000.0


In [26]:
# Merging DataFrames
# Lookup table mapping an employee name to a department. It lists 'Frank',
# who is not in df, and has no entry for 'David' or 'Eva'.
df2 = pd.DataFrame(
    {
        'Name': ['Alice', 'Bob', 'Charlie', 'Frank'],
        'Department': ['HR', 'IT', 'Finance', 'Marketing'],
    }
)
# Left join on 'Name': every row of df is kept, names without a match in df2
# get NaN for Department, and names that appear only in df2 are dropped.
merged_df = df.merge(df2, how='left', on='Name')
print("\nMerged DataFrame:", merged_df, sep="\n")


Merged DataFrame:
      Name   Age           City  Annual_Salary  Experience  Salary_After_Tax  \
0    Alice  25.0       New York          50000           3           40000.0   
1      Bob  30.0  San Francisco          70000           7           56000.0   
2  Charlie   NaN    Los Angeles          80000          12           64000.0   
3    David  28.0        Chicago          65000           5           52000.0   
4      Eva  22.0         Boston          45000           1           36000.0   

  Department  
0         HR  
1         IT  
2    Finance  
3        NaN  
4        NaN  
