In [225]:
import pandas as pd


# ***Creating a Series***

In [68]:
data = [10, 20, 30, 40]
series = pd.Series(data)
print(series)
series = pd.Series(data, index=['a', 'b', 'c', 'd'])
print(series)

0    10
1    20
2    30
3    40
dtype: int64
a    10
b    20
c    30
d    40
dtype: int64


In [70]:
data_dict = {'a': 1, 'b': 2, 'c': 3}
series_dict = pd.Series(data_dict)
print(series_dict)


a    1
b    2
c    3
dtype: int64


In [71]:
import numpy as np

data_array = np.array([100, 200, 300])
series_array = pd.Series(data_array, index=['x', 'y', 'z'])
print(series_array)


x    100
y    200
z    300
dtype: int64


# ***Creating a DataFrame***

In [72]:
data = {
    'Name': ['Alice', 'Bob', 'Charlie'],
    'Age': [25, 30, 35],
    'Score': [85.5, 90.0, 95.5]
}
df = pd.DataFrame(data)
print(df)

      Name  Age  Score
0    Alice   25   85.5
1      Bob   30   90.0
2  Charlie   35   95.5


In [73]:
data_list = [
    {'Name': 'Alice', 'Age': 25, 'Score': 85.5},
    {'Name': 'Bob', 'Age': 30, 'Score': 90.0},
    {'Name': 'Charlie', 'Age': 35, 'Score': 95.5}
]
df_list = pd.DataFrame(data_list)
print(df_list)

      Name  Age  Score
0    Alice   25   85.5
1      Bob   30   90.0
2  Charlie   35   95.5


In [74]:
data_array = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]])
df_array = pd.DataFrame(data_array, columns=['A', 'B', 'C'])
print(df_array)


   A  B  C
0  1  2  3
1  4  5  6
2  7  8  9


In [75]:
new_data = {'Col1': [10, 20], 'Col2': [30, 40]}
df1 = pd.DataFrame(new_data)
df2 = pd.DataFrame(df1)
print(df2)

   Col1  Col2
0    10    30
1    20    40


In [76]:
data = {
    'A': [1, 2, 3],
    'B': [4, 5, 6],
    'C': [7, 8, 9]
}
df3 = pd.DataFrame(data, index=['Row1', 'Row2', 'Row3'], columns=['A', 'B', 'C']) # DataFrame with custom index and columns
print(df3)


      A  B  C
Row1  1  4  7
Row2  2  5  8
Row3  3  6  9


In [77]:
empty_df = pd.DataFrame(columns=['Column1', 'Column2']) # Empty DataFrame
print(empty_df)

# Adding data
empty_df.loc[0] = [1, 2]
empty_df.loc[1] = [3, 4]
print(empty_df)

Empty DataFrame
Columns: [Column1, Column2]
Index: []
   Column1  Column2
0        1        2
1        3        4


# **Indexing and Slicing**

## **Accessing Rows and Columns by Labels (.loc[])**

In [78]:
import pandas as pd

# Sample DataFrame
data = {
    'Name': ['Alice', 'Bob', 'Charlie'],
    'Age': [25, 30, 35],
    'Score': [85.5, 90.0, 95.5]
}
df = pd.DataFrame(data, index=['A', 'B', 'C'])




In [79]:
# Access a single row
print(df.loc['A'])

# Access multiple rows
print(df.loc[['A', 'C']])

# Specific rows and columns
print(df.loc['A', 'Age'])  # Single value
print(df.loc[['A', 'C'], ['Name', 'Score']])  # Subset

Name     Alice
Age         25
Score     85.5
Name: A, dtype: object
      Name  Age  Score
A    Alice   25   85.5
C  Charlie   35   95.5
25
      Name  Score
A    Alice   85.5
C  Charlie   95.5


## **Accessing Rows and Columns by Positions (.iloc[])**

In [80]:
# Positional access
# Single row
print(df.iloc[0])

# Multiple rows
print(df.iloc[[0, 2]])

Name     Alice
Age         25
Score     85.5
Name: A, dtype: object
      Name  Age  Score
A    Alice   25   85.5
C  Charlie   35   95.5


In [81]:
# Boolean indexing
filtered = df[df['Age'] > 25]
print(filtered)


      Name  Age  Score
B      Bob   30   90.0
C  Charlie   35   95.5


### **Combining Conditions**

In [82]:
filtered = df[(df['Age'] > 25) & (df['Score'] > 90)]
print(filtered)


      Name  Age  Score
C  Charlie   35   95.5


In [83]:
filtered = df.loc[df['Age'] > 25, ['Name', 'Score']] # Select specific columns from filtered rows
print(filtered)

      Name  Score
B      Bob   90.0
C  Charlie   95.5


## **Slicing Rows and Columns**

In [84]:
# Rows from index 0 to 1 (inclusive)
print(df.iloc[0:2])

    Name  Age  Score
A  Alice   25   85.5
B    Bob   30   90.0


In [85]:
# Columns from index 1 to the end
print(df.iloc[:, 1:])

   Age  Score
A   25   85.5
B   30   90.0
C   35   95.5


# **DataFrame Operations**

## **Arithmetic Operations**

In [86]:
# Sample DataFrame
data = {
    'A': [1, 2, 3],
    'B': [4, 5, 6]
}
df = pd.DataFrame(data)

# Add 10 to each value in column A
print(df)

   A  B
0  1  4
1  2  5
2  3  6


In [87]:
print(df)
df['A'] = df['A'] + 10
print(df)

   A  B
0  1  4
1  2  5
2  3  6
    A  B
0  11  4
1  12  5
2  13  6


In [88]:
# Multiply all elements by 2
print(df)
df = df * 2
print(df)

    A  B
0  11  4
1  12  5
2  13  6
    A   B
0  22   8
1  24  10
2  26  12


In [89]:
# Add column A and B
print(df)
df['C'] = df['A'] + df['B']
print(df)


    A   B
0  22   8
1  24  10
2  26  12
    A   B   C
0  22   8  30
1  24  10  34
2  26  12  38


In [90]:
print(df)
df['D'] = df['A'].div(df['B']) # Divide column A by column B
print(df)


    A   B   C
0  22   8  30
1  24  10  34
2  26  12  38
    A   B   C         D
0  22   8  30  2.750000
1  24  10  34  2.400000
2  26  12  38  2.166667


## **Filtering Data**

In [91]:
# Filter rows where column A > 23
print(df)
filtered = df[df['A'] > 23]
print(filtered)

    A   B   C         D
0  22   8  30  2.750000
1  24  10  34  2.400000
2  26  12  38  2.166667
    A   B   C         D
1  24  10  34  2.400000
2  26  12  38  2.166667


In [92]:
# Filter rows where A > 23 and B < 12
print(df)
filtered = df[(df['A'] > 23) & (df['B'] < 12)]
print(filtered)


    A   B   C         D
0  22   8  30  2.750000
1  24  10  34  2.400000
2  26  12  38  2.166667
    A   B   C    D
1  24  10  34  2.4


In [93]:
# Select rows where A > 23, but only display columns A and C
print(df)
filtered = df.loc[df['A'] > 23, ['A', 'C']]
print(filtered)

    A   B   C         D
0  22   8  30  2.750000
1  24  10  34  2.400000
2  26  12  38  2.166667
    A   C
1  24  34
2  26  38


In [94]:
# Filter rows where Age > 30 and Score < 90
print(df)
filtered_df = df.query('A > 20 & B < 12') # | for OR & for AND ~ for NOT
print(filtered_df)
filtered_df = df[~(df['A'] > 22)]
print(filtered_df)

    A   B   C         D
0  22   8  30  2.750000
1  24  10  34  2.400000
2  26  12  38  2.166667
    A   B   C     D
0  22   8  30  2.75
1  24  10  34  2.40
    A  B   C     D
0  22  8  30  2.75


In [95]:
# Filter rows where Name is 'Alice' or 'Charlie'
print(df)
filtered_df = df[df['A'].isin([22, 26])]
print(filtered_df)


    A   B   C         D
0  22   8  30  2.750000
1  24  10  34  2.400000
2  26  12  38  2.166667
    A   B   C         D
0  22   8  30  2.750000
2  26  12  38  2.166667


In [96]:
# Sample DataFrame
data = {
    'Name': ['Alice', 'Bob', 'Charlie', 'David'],
    'Age': [25, 30, 35, 40],
    'Score': [85, 90, 78, 88]
}
df3 = pd.DataFrame(data)
print(df3)
# Filter rows where Name starts with 'A'
filtered_df = df3[df3['Name'].str.startswith('A')]
print(filtered_df)
# Filter rows where Name contains 'o'
filtered_df = df3[df3['Name'].str.contains('o')]
print(filtered_df)



      Name  Age  Score
0    Alice   25     85
1      Bob   30     90
2  Charlie   35     78
3    David   40     88
    Name  Age  Score
0  Alice   25     85
  Name  Age  Score
1  Bob   30     90


In [97]:
# Filter rows where Age is between 25 and 35
print(df3)
filtered_df = df3[df3['Age'].between(25, 35)]
print(filtered_df)


      Name  Age  Score
0    Alice   25     85
1      Bob   30     90
2  Charlie   35     78
3    David   40     88
      Name  Age  Score
0    Alice   25     85
1      Bob   30     90
2  Charlie   35     78


In [98]:
# Check if any row has a Score > 90
has_high_score = (df3['Score'] > 90).any()
print(has_high_score)  # Output: False

# Check if all rows have Age > 20
all_above_20 = (df3['Age'] > 20).all()
print(all_above_20)  # Output: True


False
True


In [99]:
# Filter rows where Age > 25 and select only the Name and Score columns
print(df3)
filtered_df = df3[df3['Age'] > 25][['Name', 'Score']]
print(filtered_df)

      Name  Age  Score
0    Alice   25     85
1      Bob   30     90
2  Charlie   35     78
3    David   40     88
      Name  Score
1      Bob     90
2  Charlie     78
3    David     88


## **Applying Functions**

In [100]:
# Square each element in column A
print(df)
df['A'] = df['B'].apply(lambda x: x ** 2)
print(df)

    A   B   C         D
0  22   8  30  2.750000
1  24  10  34  2.400000
2  26  12  38  2.166667
     A   B   C         D
0   64   8  30  2.750000
1  100  10  34  2.400000
2  144  12  38  2.166667


In [101]:
# Multiply each element by 2
print(df)
df = df.map(lambda x: x * 2)
print(df)


     A   B   C         D
0   64   8  30  2.750000
1  100  10  34  2.400000
2  144  12  38  2.166667
     A   B   C         D
0  128  16  60  5.500000
1  200  20  68  4.800000
2  288  24  76  4.333333


## **Basic Statistics**

In [102]:
print(df)
print(df.describe())  # Summary statistics
print(df['A'].mean())  # Mean of column A
print(df['B'].sum())   # Sum of column B

     A   B   C         D
0  128  16  60  5.500000
1  200  20  68  4.800000
2  288  24  76  4.333333
                A     B     C         D
count    3.000000   3.0   3.0  3.000000
mean   205.333333  20.0  68.0  4.877778
std     80.133222   4.0   8.0  0.587209
min    128.000000  16.0  60.0  4.333333
25%    164.000000  18.0  64.0  4.566667
50%    200.000000  20.0  68.0  4.800000
75%    244.000000  22.0  72.0  5.150000
max    288.000000  24.0  76.0  5.500000
205.33333333333334
60


## **Sorting Data**

In [103]:
print(df)
sorted_df = df.sort_values(by='A', ascending=False)
print(sorted_df)


     A   B   C         D
0  128  16  60  5.500000
1  200  20  68  4.800000
2  288  24  76  4.333333
     A   B   C         D
2  288  24  76  4.333333
1  200  20  68  4.800000
0  128  16  60  5.500000


In [104]:
# Sort by 'A' (ascending) and 'B' (descending)
print(df)
sorted_df = df.sort_values(by=['A', 'B'], ascending=[True, False])
print(sorted_df)


     A   B   C         D
0  128  16  60  5.500000
1  200  20  68  4.800000
2  288  24  76  4.333333
     A   B   C         D
0  128  16  60  5.500000
1  200  20  68  4.800000
2  288  24  76  4.333333


In [105]:
# Sort by 'Score' and reset the index
print(df)
sorted_df = df.sort_values(by='B', ascending=False)
print(sorted_df)
sorted_df = df.sort_values(by='B', ascending=False, ignore_index=True)
print(sorted_df)


     A   B   C         D
0  128  16  60  5.500000
1  200  20  68  4.800000
2  288  24  76  4.333333
     A   B   C         D
2  288  24  76  4.333333
1  200  20  68  4.800000
0  128  16  60  5.500000
     A   B   C         D
0  288  24  76  4.333333
1  200  20  68  4.800000
2  128  16  60  5.500000


In [106]:
# Add missing values
df.loc[3] = [23, None, None,2.34545]
print(df)
# Sort by 'B' with missing values last
sorted_df = df.sort_values(by='B')
print(sorted_df)
sorted_df = df.sort_values(by='B', na_position='last')
print(sorted_df)

       A     B     C         D
0  128.0  16.0  60.0  5.500000
1  200.0  20.0  68.0  4.800000
2  288.0  24.0  76.0  4.333333
3   23.0   NaN   NaN  2.345450
       A     B     C         D
0  128.0  16.0  60.0  5.500000
1  200.0  20.0  68.0  4.800000
2  288.0  24.0  76.0  4.333333
3   23.0   NaN   NaN  2.345450
       A     B     C         D
0  128.0  16.0  60.0  5.500000
1  200.0  20.0  68.0  4.800000
2  288.0  24.0  76.0  4.333333
3   23.0   NaN   NaN  2.345450


In [107]:
print(df)
sorted_df = df.sort_index()
print(sorted_df)


       A     B     C         D
0  128.0  16.0  60.0  5.500000
1  200.0  20.0  68.0  4.800000
2  288.0  24.0  76.0  4.333333
3   23.0   NaN   NaN  2.345450
       A     B     C         D
0  128.0  16.0  60.0  5.500000
1  200.0  20.0  68.0  4.800000
2  288.0  24.0  76.0  4.333333
3   23.0   NaN   NaN  2.345450


In [108]:
# Sort rows by index in descending order
print(df)
sorted_df = df.sort_index(ascending=False, ignore_index=True)
print(sorted_df)

       A     B     C         D
0  128.0  16.0  60.0  5.500000
1  200.0  20.0  68.0  4.800000
2  288.0  24.0  76.0  4.333333
3   23.0   NaN   NaN  2.345450
       A     B     C         D
0   23.0   NaN   NaN  2.345450
1  288.0  24.0  76.0  4.333333
2  200.0  20.0  68.0  4.800000
3  128.0  16.0  60.0  5.500000


In [109]:
# Sort columns alphabetically
print(df3)
sorted_df = df3.sort_index(axis=1)
print(sorted_df)

      Name  Age  Score
0    Alice   25     85
1      Bob   30     90
2  Charlie   35     78
3    David   40     88
   Age     Name  Score
0   25    Alice     85
1   30      Bob     90
2   35  Charlie     78
3   40    David     88


In [110]:
data = {
    'Name': ['Charlie', 'Bob', 'Alice', 'David'],
    'Age': [25, 30, 35, 40],
    'Score': [85, 90, 78, 88]
}
df3 = pd.DataFrame(data)
print(df3)

      Name  Age  Score
0  Charlie   25     85
1      Bob   30     90
2    Alice   35     78
3    David   40     88


In [111]:
# Sort by the length of names
print(df3)
sorted_df = df3.sort_values(by='Name', key=lambda col: col.str.len())
print(sorted_df)


      Name  Age  Score
0  Charlie   25     85
1      Bob   30     90
2    Alice   35     78
3    David   40     88
      Name  Age  Score
1      Bob   30     90
2    Alice   35     78
3    David   40     88
0  Charlie   25     85


# ***Data Cleaning and Manipulation***

# **Handling Missing Data in Pandas**

## **Identifying Missing Data**

In [112]:
data = {
    'Name': ['Alice', 'Bob', None],
    'Age': [25, None, 30],
    'Score': [85.5, 90.0, None]
}
df = pd.DataFrame(data)

print(df)

    Name   Age  Score
0  Alice  25.0   85.5
1    Bob   NaN   90.0
2   None  30.0    NaN


In [113]:
# Identify missing values
print(df.isnull())

    Name    Age  Score
0  False  False  False
1  False   True  False
2   True  False   True


In [114]:
print(df.notnull()) # Identify non-missing values


    Name    Age  Score
0   True   True   True
1   True  False   True
2  False   True  False


In [115]:
print(df.isnull().sum()) # Count missing values in each column

Name     1
Age      1
Score    1
dtype: int64


## **Removing Missing Data**

In [116]:
# Drop rows with any missing values
print(df)
cleaned_df = df.dropna()
print(cleaned_df)

    Name   Age  Score
0  Alice  25.0   85.5
1    Bob   NaN   90.0
2   None  30.0    NaN
    Name   Age  Score
0  Alice  25.0   85.5


In [117]:
print(df)
cleaned_df = df.dropna(axis=1) # Drop columns with any missing values
print(cleaned_df)

    Name   Age  Score
0  Alice  25.0   85.5
1    Bob   NaN   90.0
2   None  30.0    NaN
Empty DataFrame
Columns: []
Index: [0, 1, 2]


In [118]:
# Keep rows with at least 2 non-missing values4
print(df)
cleaned_df = df.dropna(thresh=2)
print(cleaned_df)

    Name   Age  Score
0  Alice  25.0   85.5
1    Bob   NaN   90.0
2   None  30.0    NaN
    Name   Age  Score
0  Alice  25.0   85.5
1    Bob   NaN   90.0


## **Filling Missing Data**

In [119]:
# Fill missing values with 0
print(df)
filled_df = df.fillna(0)
print(filled_df)


    Name   Age  Score
0  Alice  25.0   85.5
1    Bob   NaN   90.0
2   None  30.0    NaN
    Name   Age  Score
0  Alice  25.0   85.5
1    Bob   0.0   90.0
2      0  30.0    0.0


In [120]:
print(df)
# Fill missing values in Age with the mean
df['Age'] = df['Age'].fillna(df['Age'].mean())

# Fill missing values in Score with the median
df['Score'] = df['Score'].fillna(df['Score'].median())
print(df)


    Name   Age  Score
0  Alice  25.0   85.5
1    Bob   NaN   90.0
2   None  30.0    NaN
    Name   Age  Score
0  Alice  25.0  85.50
1    Bob  27.5  90.00
2   None  30.0  87.75


In [121]:
df = pd.DataFrame(data)
print(df)
# Forward fill (propagate the last valid value forward)
df['Name'] = df['Name'].ffill()
print(df)


    Name   Age  Score
0  Alice  25.0   85.5
1    Bob   NaN   90.0
2   None  30.0    NaN
    Name   Age  Score
0  Alice  25.0   85.5
1    Bob   NaN   90.0
2    Bob  30.0    NaN


In [122]:
# Backward fill (propagate the next valid value backward)
df = pd.DataFrame(data)
print(df)
df['Age'] = df['Age'].bfill()
print(df)


    Name   Age  Score
0  Alice  25.0   85.5
1    Bob   NaN   90.0
2   None  30.0    NaN
    Name   Age  Score
0  Alice  25.0   85.5
1    Bob  30.0   90.0
2   None  30.0    NaN


## **Interpolating Missing Data**

In [123]:
# Linear interpolation for numeric columns
df = pd.DataFrame(data)
print(df)
df['Age'] = df['Age'].interpolate(method='linear')
print(df)

    Name   Age  Score
0  Alice  25.0   85.5
1    Bob   NaN   90.0
2   None  30.0    NaN
    Name   Age  Score
0  Alice  25.0   85.5
1    Bob  27.5   90.0
2   None  30.0    NaN


# **Renaming Columns and Index**

In [124]:
data = {
    'old_name1': [1, 2, 3],
    'old_name2': [4, 5, 6]
}
df = pd.DataFrame(data)
print(df)

   old_name1  old_name2
0          1          4
1          2          5
2          3          6


In [125]:
df = df.rename(columns={'old_name1': 'new_name1', 'old_name2': 'new_name2'})
print(df)


   new_name1  new_name2
0          1          4
1          2          5
2          3          6


In [126]:
# Rename index labels
df = df.rename(index={0: 'first', 1: 'second', 2: 'third'})
print(df)


        new_name1  new_name2
first           1          4
second          2          5
third           3          6


In [127]:
# Convert column names to uppercase
print(df)
df = df.rename(columns=str.upper)
print(df)

        new_name1  new_name2
first           1          4
second          2          5
third           3          6
        NEW_NAME1  NEW_NAME2
first           1          4
second          2          5
third           3          6


In [128]:
# Add a prefix to index labels
print(df)
df = df.rename(index=lambda x: f"row_{x}")
print(df)


        NEW_NAME1  NEW_NAME2
first           1          4
second          2          5
third           3          6
            NEW_NAME1  NEW_NAME2
row_first           1          4
row_second          2          5
row_third           3          6


In [129]:
# Rename all columns
print(df)
df.columns = ['Column1', 'Column2']
print(df)


            NEW_NAME1  NEW_NAME2
row_first           1          4
row_second          2          5
row_third           3          6
            Column1  Column2
row_first         1        4
row_second        2        5
row_third         3        6


In [130]:
# Rename all index labels
print(df)
df.index = ['Index1', 'Index2', 'Index3']
print(df)


            Column1  Column2
row_first         1        4
row_second        2        5
row_third         3        6
        Column1  Column2
Index1        1        4
Index2        2        5
Index3        3        6


In [131]:
# Rename columns in place
print(df)
df.rename(columns={'Column1': 'C1', 'Column2': 'C2'}, inplace=True)
print(df)


        Column1  Column2
Index1        1        4
Index2        2        5
Index3        3        6
        C1  C2
Index1   1   4
Index2   2   5
Index3   3   6


In [132]:
print(df)
df = df.reset_index() # Reset index labels
print(df)


        C1  C2
Index1   1   4
Index2   2   5
Index3   3   6
    index  C1  C2
0  Index1   1   4
1  Index2   2   5
2  Index3   3   6


# **Aggregation and Grouping**


# **Grouping Data**

In [133]:
# Sample DataFrame
data = {
    'Category': ['A', 'B', 'A', 'B', 'C'],
    'Value': [10, 20, 30, 40, 50],
    'Quantity': [1, 2, 3, 4, 5]
}
df = pd.DataFrame(data)
print(df)

  Category  Value  Quantity
0        A     10         1
1        B     20         2
2        A     30         3
3        B     40         4
4        C     50         5


In [134]:
# Group by 'Category' and calculate the sum of 'Value'
print(df)
grouped = df.groupby('Category')['Value'].sum()
print(grouped)

  Category  Value  Quantity
0        A     10         1
1        B     20         2
2        A     30         3
3        B     40         4
4        C     50         5
Category
A    40
B    60
C    50
Name: Value, dtype: int64


In [135]:
# Group by 'Category' and 'Quantity' and calculate the sum of 'Value'
print(df)
grouped = df.groupby(['Category', 'Quantity'])['Value'].sum()
print(grouped)

  Category  Value  Quantity
0        A     10         1
1        B     20         2
2        A     30         3
3        B     40         4
4        C     50         5
Category  Quantity
A         1           10
          3           30
B         2           20
          4           40
C         5           50
Name: Value, dtype: int64


In [136]:
# Add a column with the mean value of each group
print(df)
df['GroupMean'] = df.groupby('Category')['Value'].transform('mean')
print(df)


  Category  Value  Quantity
0        A     10         1
1        B     20         2
2        A     30         3
3        B     40         4
4        C     50         5
  Category  Value  Quantity  GroupMean
0        A     10         1       20.0
1        B     20         2       30.0
2        A     30         3       20.0
3        B     40         4       30.0
4        C     50         5       50.0


In [137]:
# Keep only groups where the sum of 'Value' is greater than 50
print(df)
filtered = df.groupby('Category').filter(lambda group: group['Value'].sum() > 50)
print(filtered)


  Category  Value  Quantity  GroupMean
0        A     10         1       20.0
1        B     20         2       30.0
2        A     30         3       20.0
3        B     40         4       30.0
4        C     50         5       50.0
  Category  Value  Quantity  GroupMean
1        B     20         2       30.0
3        B     40         4       30.0


In [138]:
# Set 'Category' as the index and group by it
df = df.set_index('Category')
grouped = df.groupby(level=0).sum()
print(grouped)

          Value  Quantity  GroupMean
Category                            
A            40         4       40.0
B            60         6       60.0
C            50         5       50.0


# **Aggregation Functions**

In [139]:
# Apply aggregations to the entire DataFrame
print(df)
print(df.sum())   # Sum of all numeric columns
print(df.mean())  # Mean of all numeric columns
print(df.median())  # Median of all numeric columns
print(df.min())   # Min of all numeric columns
print(df.max())   # Max of all numeric columns
print(df.std())   # Standard deviation of all numeric columns
print(df.count()) # Count of non-null values in all columns
print(df.var()) # Variance of all numeric columns
print(df.prod()) # Product of all numeric columns
print(df['Value'].sum())  # Sum of 'Value' column
print(df[['Value', 'Quantity']].mean())

          Value  Quantity  GroupMean
Category                            
A            10         1       20.0
B            20         2       30.0
A            30         3       20.0
B            40         4       30.0
C            50         5       50.0
Value        150.0
Quantity      15.0
GroupMean    150.0
dtype: float64
Value        30.0
Quantity      3.0
GroupMean    30.0
dtype: float64
Value        30.0
Quantity      3.0
GroupMean    30.0
dtype: float64
Value        10.0
Quantity      1.0
GroupMean    20.0
dtype: float64
Value        50.0
Quantity      5.0
GroupMean    50.0
dtype: float64
Value        15.811388
Quantity      1.581139
GroupMean    12.247449
dtype: float64
Value        5
Quantity     5
GroupMean    5
dtype: int64
Value        250.0
Quantity       2.5
GroupMean    150.0
dtype: float64
Value        12000000.0
Quantity          120.0
GroupMean    18000000.0
dtype: float64
150
Value       30.0
Quantity     3.0
dtype: float64


In [140]:
# Group by 'Category' and apply multiple aggregations to 'Value'
print(df)
grouped = df.groupby('Category')['Value'].agg(['sum', 'mean', 'count'])
print(grouped)

          Value  Quantity  GroupMean
Category                            
A            10         1       20.0
B            20         2       30.0
A            30         3       20.0
B            40         4       30.0
C            50         5       50.0
          sum  mean  count
Category                  
A          40  20.0      2
B          60  30.0      2
C          50  50.0      1


In [141]:
# Group by 'Category' and calculate the range of 'Value'
print(df)
def value_range(series):
    return series.max() - series.min()

grouped = df.groupby('Category')['Value'].agg(value_range)
print(grouped)

          Value  Quantity  GroupMean
Category                            
A            10         1       20.0
B            20         2       30.0
A            30         3       20.0
B            40         4       30.0
C            50         5       50.0
Category
A    20
B    20
C     0
Name: Value, dtype: int64


In [142]:
print(df)
grouped = df.groupby('Category')
for name, group in grouped:
    print(f"Group: {name}")
    print(group)


          Value  Quantity  GroupMean
Category                            
A            10         1       20.0
B            20         2       30.0
A            30         3       20.0
B            40         4       30.0
C            50         5       50.0
Group: A
          Value  Quantity  GroupMean
Category                            
A            10         1       20.0
A            30         3       20.0
Group: B
          Value  Quantity  GroupMean
Category                            
B            20         2       30.0
B            40         4       30.0
Group: C
          Value  Quantity  GroupMean
Category                            
C            50         5       50.0


In [143]:
# Group by 'Category' and reset the index
print(df)
grouped = df.groupby('Category')['Value'].sum().reset_index()
print(grouped)


          Value  Quantity  GroupMean
Category                            
A            10         1       20.0
B            20         2       30.0
A            30         3       20.0
B            40         4       30.0
C            50         5       50.0
  Category  Value
0        A     40
1        B     60
2        C     50


In [144]:
# Apply different aggregations to 'Value' and 'Quantity'
print(df)
grouped = df.groupby('Category').agg({
    'Value': 'sum',
    'Quantity': 'mean'
})
print(grouped)


          Value  Quantity  GroupMean
Category                            
A            10         1       20.0
B            20         2       30.0
A            30         3       20.0
B            40         4       30.0
C            50         5       50.0
          Value  Quantity
Category                 
A            40       2.0
B            60       3.0
C            50       5.0


# **Pivot Tables in Pandas**

In [145]:
# Sample DataFrame
data = {
    'Region': ['East', 'West', 'East', 'West', 'East', 'West'],
    'Product': ['A', 'A', 'B', 'B', 'C', 'C'],
    'Sales': [100, 150, 200, 250, 300, 350],
}
df = pd.DataFrame(data)
print(df)
# Create a pivot table
pivot_table = df.pivot_table(
    values='Sales',
    index='Region',
    columns='Product',
    aggfunc='sum'
)
print(pivot_table)

  Region Product  Sales
0   East       A    100
1   West       A    150
2   East       B    200
3   West       B    250
4   East       C    300
5   West       C    350
Product    A    B    C
Region                
East     100  200  300
West     150  250  350


In [146]:
# Create a pivot table with multiple aggregation functions
pivot_table = df.pivot_table(
    values='Sales',
    index='Region',
    columns='Product',
    aggfunc=['sum', 'mean']
)
print(pivot_table)

         sum             mean              
Product    A    B    C      A      B      C
Region                                     
East     100  200  300  100.0  200.0  300.0
West     150  250  350  150.0  250.0  350.0


In [147]:
pivot_table = df.pivot_table(
    values='Sales',
    index='Region',
    columns='Product',
    aggfunc='sum',
    fill_value=0
) # Fill missing values with 0
print(pivot_table)

Product    A    B    C
Region                
East     100  200  300
West     150  250  350


In [148]:
pivot_table = df.pivot_table(
    values='Sales',
    index='Region',
    columns='Product',
    aggfunc='sum',
    margins=True
) # Add row and column totals
print(pivot_table)


Product    A    B    C   All
Region                      
East     100  200  300   600
West     150  250  350   750
All      250  450  650  1350


In [149]:
# Sample DataFrame with additional grouping by Year
data = {
    'Year': [2020, 2020, 2021, 2021, 2021],
    'Region': ['East', 'West', 'East', 'West', 'East'],
    'Product': ['A', 'A', 'B', 'B', 'C'],
    'Sales': [100, 150, 200, 250, 300],
}
df = pd.DataFrame(data)

# Create a multi-level pivot table with both 'Year' and 'Region' as indexes
pivot_table = df.pivot_table(
    values='Sales',
    index=['Year', 'Region'],
    columns='Product',
    aggfunc='sum'
)
print(pivot_table)


Product          A      B      C
Year Region                     
2020 East    100.0    NaN    NaN
     West    150.0    NaN    NaN
2021 East      NaN  200.0  300.0
     West      NaN  250.0    NaN


In [150]:
# Define a custom aggregation function to find the range (max - min)
def value_range(series):
    return series.max() - series.min()

# Create a pivot table with custom aggregation
pivot_table = df.pivot_table(
    values='Sales',
    index='Region',
    columns='Product',
    aggfunc=value_range
)
print(pivot_table)


Product    A    B    C
Region                
East     0.0  0.0  0.0
West     0.0  0.0  NaN


# ***Merging and Joining***

# **Merging DataFrames**

In [151]:
df1 = pd.DataFrame({
    'ID': [1, 2, 3],
    'Name': ['Alice', 'Bob', 'Charlie']
})

df2 = pd.DataFrame({
    'ID': [2, 3, 4],
    'Score': [85, 90, 95]
})



In [152]:
# Inner Join
result = pd.merge(df1, df2, on='ID')
print(result)

   ID     Name  Score
0   2      Bob     85
1   3  Charlie     90


In [153]:
result = pd.merge(df1, df2, on='ID', how='outer') # Outer Join
print(result)


   ID     Name  Score
0   1    Alice    NaN
1   2      Bob   85.0
2   3  Charlie   90.0
3   4      NaN   95.0


In [154]:
result = pd.merge(df1, df2, on='ID', how='left')
print(result)


   ID     Name  Score
0   1    Alice    NaN
1   2      Bob   85.0
2   3  Charlie   90.0


In [155]:
result = pd.merge(df1, df2, on='ID', how='right')
print(result)


   ID     Name  Score
0   2      Bob     85
1   3  Charlie     90
2   4      NaN     95


In [156]:
df3 = pd.DataFrame({
    'User_ID': [2, 3, 4],
    'Score': [85, 90, 95]
})

result = pd.merge(df1, df3, left_on='ID', right_on='User_ID') # Merge on 'ID' and 'User_ID'
print(result)


   ID     Name  User_ID  Score
0   2      Bob        2     85
1   3  Charlie        3     90


In [157]:
df4 = pd.DataFrame({
    'Score': [85, 90, 95]
}, index=[2, 3, 4])

result = pd.merge(df1, df4, left_on='ID', right_index=True)
print(result)

   ID     Name  Score
1   2      Bob     85
2   3  Charlie     90


# **The .join() Method**

In [158]:
df1 = pd.DataFrame({
    'Name': ['Alice', 'Bob', 'Charlie'],
    'Age': [25, 30, 35]
}, index=[1, 2, 3])

df2 = pd.DataFrame({
    'Score': [85, 90, 95]
}, index=[2, 3, 4])



In [159]:
result = df1.join(df2)
print(result)


      Name  Age  Score
1    Alice   25    NaN
2      Bob   30   85.0
3  Charlie   35   90.0


In [160]:
result = df1.join(df2, how='inner') # Inner join
print(result)


      Name  Age  Score
2      Bob   30     85
3  Charlie   35     90


# **Concatenation DataFrames**

In [161]:
# Sample DataFrames
df1 = pd.DataFrame({
    'A': [1, 2],
    'B': [3, 4]
})

df2 = pd.DataFrame({
    'A': [5, 6],
    'B': [7, 8]
})




In [162]:
# Concatenate along rows
result = pd.concat([df1, df2])
print(result)

   A  B
0  1  3
1  2  4
0  5  7
1  6  8


In [163]:
# Concatenate along columns
result = pd.concat([df1, df2], axis=1)
print(result)


   A  B  A  B
0  1  3  5  7
1  2  4  6  8


In [164]:
result = pd.concat([df1, df2], ignore_index=True)
print(result)


   A  B
0  1  3
1  2  4
2  5  7
3  6  8


In [165]:
df3 = pd.DataFrame({
    'A': [9, 10],
    'C': [11, 12]
})

result = pd.concat([df1, df3])
print(result)

    A    B     C
0   1  3.0   NaN
1   2  4.0   NaN
0   9  NaN  11.0
1  10  NaN  12.0


In [166]:
result = pd.concat([df1, df3], join='inner')
print(result)


    A
0   1
1   2
0   9
1  10


In [167]:
result = pd.concat([df1, df2], keys=['First', 'Second'])
print(result)


          A  B
First  0  1  3
       1  2  4
Second 0  5  7
       1  6  8


In [168]:
df4 = pd.DataFrame({
    'C': [13, 14]
}, index=[1, 2])

result = pd.concat([df1, df4], axis=1)
print(result)

     A    B     C
0  1.0  3.0   NaN
1  2.0  4.0  13.0
2  NaN  NaN  14.0


# **Time Series Analysis**

# **Datetime Operations**

In [169]:
# Converting strings to datetime
dates = ['2025-01-01', '2025-01-02', '2025-01-03']
datetime_series = pd.to_datetime(dates)

print(datetime_series)


DatetimeIndex(['2025-01-01', '2025-01-02', '2025-01-03'], dtype='datetime64[ns]', freq=None)


In [170]:
# Generate a daily date range
date_range = pd.date_range(start='2025-01-01', end='2025-01-10', freq='D')
print(date_range)

DatetimeIndex(['2025-01-01', '2025-01-02', '2025-01-03', '2025-01-04',
               '2025-01-05', '2025-01-06', '2025-01-07', '2025-01-08',
               '2025-01-09', '2025-01-10'],
              dtype='datetime64[ns]', freq='D')


In [171]:
# Sample datetime series
data = pd.Series(pd.date_range('2025-01-01', periods=5, freq='D'))

# Extracting components
print("Year:", data.dt.year)
print("Month:", data.dt.month)
print("Day:", data.dt.day)
print("Weekday:", data.dt.weekday)

Year: 0    2025
1    2025
2    2025
3    2025
4    2025
dtype: int32
Month: 0    1
1    1
2    1
3    1
4    1
dtype: int32
Day: 0    1
1    2
2    3
3    4
4    5
dtype: int32
Weekday: 0    2
1    3
2    4
3    5
4    6
dtype: int32


In [172]:
# Creating a DataFrame with datetime index
df = pd.DataFrame({
    'Value': [10, 20, 15, 25, 30]
}, index=pd.date_range('2025-01-01', periods=5, freq='D'))

print(df)

            Value
2025-01-01     10
2025-01-02     20
2025-01-03     15
2025-01-04     25
2025-01-05     30


In [173]:
# Resample to calculate weekly sum
weekly = df.resample('W').sum()
print(weekly)


            Value
2025-01-05    100


In [174]:
df2 = pd.DataFrame({
    'Value': [50, 60]
}, index=pd.date_range('2025-01-03', periods=2, freq='D'))
print(df)
# Combine data
combined = df.add(df2, fill_value=0)
print(combined)


            Value
2025-01-01     10
2025-01-02     20
2025-01-03     15
2025-01-04     25
2025-01-05     30
            Value
2025-01-01   10.0
2025-01-02   20.0
2025-01-03   65.0
2025-01-04   85.0
2025-01-05   30.0


In [175]:
# Shifting values
print(df)
df['Shifted'] = df['Value'].shift(1)
print(df)


            Value
2025-01-01     10
2025-01-02     20
2025-01-03     15
2025-01-04     25
2025-01-05     30
            Value  Shifted
2025-01-01     10      NaN
2025-01-02     20     10.0
2025-01-03     15     20.0
2025-01-04     25     15.0
2025-01-05     30     25.0


In [176]:
# Calculate a rolling mean with a window of 2
print(df)
df['Rolling_Mean'] = df['Value'].rolling(window=2).mean()
print(df)


            Value  Shifted
2025-01-01     10      NaN
2025-01-02     20     10.0
2025-01-03     15     20.0
2025-01-04     25     15.0
2025-01-05     30     25.0
            Value  Shifted  Rolling_Mean
2025-01-01     10      NaN           NaN
2025-01-02     20     10.0          15.0
2025-01-03     15     20.0          17.5
2025-01-04     25     15.0          20.0
2025-01-05     30     25.0          27.5


In [177]:
# Reindexing with a full date range
print(df)
full_range = pd.date_range('2025-01-01', '2025-01-10', freq='D')
df = df.reindex(full_range)
df['Value'] = df['Value'].fillna(0)  # Fill missing values
print(df)


            Value  Shifted  Rolling_Mean
2025-01-01     10      NaN           NaN
2025-01-02     20     10.0          15.0
2025-01-03     15     20.0          17.5
2025-01-04     25     15.0          20.0
2025-01-05     30     25.0          27.5
            Value  Shifted  Rolling_Mean
2025-01-01   10.0      NaN           NaN
2025-01-02   20.0     10.0          15.0
2025-01-03   15.0     20.0          17.5
2025-01-04   25.0     15.0          20.0
2025-01-05   30.0     25.0          27.5
2025-01-06    0.0      NaN           NaN
2025-01-07    0.0      NaN           NaN
2025-01-08    0.0      NaN           NaN
2025-01-09    0.0      NaN           NaN
2025-01-10    0.0      NaN           NaN


In [178]:
# Localizing timezone
print(df)
df.index = df.index.tz_localize('UTC')
print(df)
# Converting timezone
df.index = df.index.tz_convert('US/Eastern')
print(df)

            Value  Shifted  Rolling_Mean
2025-01-01   10.0      NaN           NaN
2025-01-02   20.0     10.0          15.0
2025-01-03   15.0     20.0          17.5
2025-01-04   25.0     15.0          20.0
2025-01-05   30.0     25.0          27.5
2025-01-06    0.0      NaN           NaN
2025-01-07    0.0      NaN           NaN
2025-01-08    0.0      NaN           NaN
2025-01-09    0.0      NaN           NaN
2025-01-10    0.0      NaN           NaN
                           Value  Shifted  Rolling_Mean
2025-01-01 00:00:00+00:00   10.0      NaN           NaN
2025-01-02 00:00:00+00:00   20.0     10.0          15.0
2025-01-03 00:00:00+00:00   15.0     20.0          17.5
2025-01-04 00:00:00+00:00   25.0     15.0          20.0
2025-01-05 00:00:00+00:00   30.0     25.0          27.5
2025-01-06 00:00:00+00:00    0.0      NaN           NaN
2025-01-07 00:00:00+00:00    0.0      NaN           NaN
2025-01-08 00:00:00+00:00    0.0      NaN           NaN
2025-01-09 00:00:00+00:00    0.0      NaN    

# **Resampling and Shifting**

In [179]:
# Sample time-series data
data = pd.DataFrame({
    'Value': [10, 20, 30, 40, 50, 60]
}, index=pd.date_range('2025-01-01', periods=6, freq='D'))

# Downsample to weekly frequency and calculate the sum
weekly = data.resample('W').sum()
print(weekly)


            Value
2025-01-05    150
2025-01-12     60


In [180]:
# Upsample to daily frequency (filling with NaN)
print(weekly)
upsampled = weekly.resample('D').asfreq()
print(upsampled)


            Value
2025-01-05    150
2025-01-12     60
            Value
2025-01-05  150.0
2025-01-06    NaN
2025-01-07    NaN
2025-01-08    NaN
2025-01-09    NaN
2025-01-10    NaN
2025-01-11    NaN
2025-01-12   60.0


In [181]:
# Calculate the mean for weekly frequency
weekly_mean = data.resample('W').mean()
print(weekly_mean)


            Value
2025-01-05   30.0
2025-01-12   60.0


In [182]:
# Apply custom aggregation (e.g., sum of squares)
custom = data.resample('W').apply(lambda x: (x**2).sum())
print(custom)


            Value
2025-01-05   5500
2025-01-12   3600


In [183]:
# Shift data forward by 1 period
print(data)
shifted_forward = data.shift(1)
print(shifted_forward)


            Value
2025-01-01     10
2025-01-02     20
2025-01-03     30
2025-01-04     40
2025-01-05     50
2025-01-06     60
            Value
2025-01-01    NaN
2025-01-02   10.0
2025-01-03   20.0
2025-01-04   30.0
2025-01-05   40.0
2025-01-06   50.0


In [184]:
# Shift index by 2 days
#shifted_by_days = data.tshift(2, freq='D')
#print(shifted_by_days)

In [185]:
# Add lagged and leading columns
data['Lagged'] = data['Value'].shift(1)
data['Leading'] = data['Value'].shift(-1)
print(data)


            Value  Lagged  Leading
2025-01-01     10     NaN     20.0
2025-01-02     20    10.0     30.0
2025-01-03     30    20.0     40.0
2025-01-04     40    30.0     50.0
2025-01-05     50    40.0     60.0
2025-01-06     60    50.0      NaN


In [186]:
# Resample to monthly frequency and shift by one month
monthly_shifted = data.resample('ME').sum().shift(1)
print(monthly_shifted)


            Value  Lagged  Leading
2025-01-31    NaN     NaN      NaN


# **Date Range Generation**

In [187]:
# Generate a daily date range
dates = pd.date_range(start='2025-01-01', end='2025-01-10', freq='D')
print(dates)


DatetimeIndex(['2025-01-01', '2025-01-02', '2025-01-03', '2025-01-04',
               '2025-01-05', '2025-01-06', '2025-01-07', '2025-01-08',
               '2025-01-09', '2025-01-10'],
              dtype='datetime64[ns]', freq='D')


In [188]:
# Generate a date range with a fixed number of periods
dates = pd.date_range(start='2025-01-01', periods=5, freq='D')
print(dates)


DatetimeIndex(['2025-01-01', '2025-01-02', '2025-01-03', '2025-01-04',
               '2025-01-05'],
              dtype='datetime64[ns]', freq='D')


In [189]:
# Generate a date range ending on a specific date
dates = pd.date_range(end='2025-01-10', periods=5, freq='D')
print(dates)


DatetimeIndex(['2025-01-06', '2025-01-07', '2025-01-08', '2025-01-09',
               '2025-01-10'],
              dtype='datetime64[ns]', freq='D')


In [190]:
# Generate a range with timezone
dates = pd.date_range(start='2025-01-01', periods=3, freq='D', tz='UTC')
print(dates)


DatetimeIndex(['2025-01-01 00:00:00+00:00', '2025-01-02 00:00:00+00:00',
               '2025-01-03 00:00:00+00:00'],
              dtype='datetime64[ns, UTC]', freq='D')


In [191]:
# Create a DataFrame with a date range as the index
data = pd.DataFrame({'Value': [10, 20, 30]}, index=pd.date_range('2025-01-01', periods=3, freq='D'))
print(data)

            Value
2025-01-01     10
2025-01-02     20
2025-01-03     30


# ***Data Manipulation***

# **String Manipulation**

In [192]:
# Sample data
data = pd.DataFrame({
    'Name': ['Alice', 'Bob', 'Charlie'],
    'City': ['New York', 'Los Angeles', 'San Francisco']
})

# Access string methods
print(data['Name'].str.upper())

0      ALICE
1        BOB
2    CHARLIE
Name: Name, dtype: object


In [193]:
# Convert to lowercase
data['Name_Lower'] = data['Name'].str.lower()
print(data)

# Convert to uppercase
data['Name_Upper'] = data['Name'].str.upper()
print(data)

# Capitalize first letter
data['Name_Cap'] = data['Name'].str.capitalize()
print(data)


      Name           City Name_Lower
0    Alice       New York      alice
1      Bob    Los Angeles        bob
2  Charlie  San Francisco    charlie
      Name           City Name_Lower Name_Upper
0    Alice       New York      alice      ALICE
1      Bob    Los Angeles        bob        BOB
2  Charlie  San Francisco    charlie    CHARLIE
      Name           City Name_Lower Name_Upper Name_Cap
0    Alice       New York      alice      ALICE    Alice
1      Bob    Los Angeles        bob        BOB      Bob
2  Charlie  San Francisco    charlie    CHARLIE  Charlie


In [194]:
# Remove leading and trailing whitespaces
data['City_Trimmed'] = data['City'].str.strip()
print(data)

# Add padding
data['City_Padded'] = data['City'].str.pad(20, side='left', fillchar='-')
print(data)


      Name           City Name_Lower Name_Upper Name_Cap   City_Trimmed
0    Alice       New York      alice      ALICE    Alice       New York
1      Bob    Los Angeles        bob        BOB      Bob    Los Angeles
2  Charlie  San Francisco    charlie    CHARLIE  Charlie  San Francisco
      Name           City Name_Lower Name_Upper Name_Cap   City_Trimmed  \
0    Alice       New York      alice      ALICE    Alice       New York   
1      Bob    Los Angeles        bob        BOB      Bob    Los Angeles   
2  Charlie  San Francisco    charlie    CHARLIE  Charlie  San Francisco   

            City_Padded  
0  ------------New York  
1  ---------Los Angeles  
2  -------San Francisco  


In [195]:
# Extract first 3 characters
data['Name_Substr'] = data['Name'].str[:3]
print(data)
# Extract using a specific range
data['City_Part'] = data['City'].str[4:8]
print(data)


      Name           City Name_Lower Name_Upper Name_Cap   City_Trimmed  \
0    Alice       New York      alice      ALICE    Alice       New York   
1      Bob    Los Angeles        bob        BOB      Bob    Los Angeles   
2  Charlie  San Francisco    charlie    CHARLIE  Charlie  San Francisco   

            City_Padded Name_Substr  
0  ------------New York         Ali  
1  ---------Los Angeles         Bob  
2  -------San Francisco         Cha  
      Name           City Name_Lower Name_Upper Name_Cap   City_Trimmed  \
0    Alice       New York      alice      ALICE    Alice       New York   
1      Bob    Los Angeles        bob        BOB      Bob    Los Angeles   
2  Charlie  San Francisco    charlie    CHARLIE  Charlie  San Francisco   

            City_Padded Name_Substr City_Part  
0  ------------New York         Ali      York  
1  ---------Los Angeles         Bob      Ange  
2  -------San Francisco         Cha      Fran  


In [196]:
# Replace substrings
data['City_Replaced'] = data['City'].str.replace(' ', '_', regex=False)
print(data)

      Name           City Name_Lower Name_Upper Name_Cap   City_Trimmed  \
0    Alice       New York      alice      ALICE    Alice       New York   
1      Bob    Los Angeles        bob        BOB      Bob    Los Angeles   
2  Charlie  San Francisco    charlie    CHARLIE  Charlie  San Francisco   

            City_Padded Name_Substr City_Part  City_Replaced  
0  ------------New York         Ali      York       New_York  
1  ---------Los Angeles         Bob      Ange    Los_Angeles  
2  -------San Francisco         Cha      Fran  San_Francisco  


In [197]:
# Split strings into lists
data['City_Split'] = data['City'].str.split(' ')
print(data)

# Join strings
data['City_Joined'] = data['City_Split'].str.join('-')
print(data)


      Name           City Name_Lower Name_Upper Name_Cap   City_Trimmed  \
0    Alice       New York      alice      ALICE    Alice       New York   
1      Bob    Los Angeles        bob        BOB      Bob    Los Angeles   
2  Charlie  San Francisco    charlie    CHARLIE  Charlie  San Francisco   

            City_Padded Name_Substr City_Part  City_Replaced        City_Split  
0  ------------New York         Ali      York       New_York       [New, York]  
1  ---------Los Angeles         Bob      Ange    Los_Angeles    [Los, Angeles]  
2  -------San Francisco         Cha      Fran  San_Francisco  [San, Francisco]  
      Name           City Name_Lower Name_Upper Name_Cap   City_Trimmed  \
0    Alice       New York      alice      ALICE    Alice       New York   
1      Bob    Los Angeles        bob        BOB      Bob    Los Angeles   
2  Charlie  San Francisco    charlie    CHARLIE  Charlie  San Francisco   

            City_Padded Name_Substr City_Part  City_Replaced  \
0  -------

In [198]:
# Check if strings start with a specific character
data['Starts_With_S'] = data['Name'].str.startswith('S')
print(data)
# Check if strings contain digits
data['Has_Digits'] = data['City'].str.isdigit()
print(data)

      Name           City Name_Lower Name_Upper Name_Cap   City_Trimmed  \
0    Alice       New York      alice      ALICE    Alice       New York   
1      Bob    Los Angeles        bob        BOB      Bob    Los Angeles   
2  Charlie  San Francisco    charlie    CHARLIE  Charlie  San Francisco   

            City_Padded Name_Substr City_Part  City_Replaced  \
0  ------------New York         Ali      York       New_York   
1  ---------Los Angeles         Bob      Ange    Los_Angeles   
2  -------San Francisco         Cha      Fran  San_Francisco   

         City_Split    City_Joined  Starts_With_S  
0       [New, York]       New-York          False  
1    [Los, Angeles]    Los-Angeles          False  
2  [San, Francisco]  San-Francisco          False  
      Name           City Name_Lower Name_Upper Name_Cap   City_Trimmed  \
0    Alice       New York      alice      ALICE    Alice       New York   
1      Bob    Los Angeles        bob        BOB      Bob    Los Angeles   
2  Charli

In [199]:
# Extract numbers from strings
data['Extract_Number'] = data['City'].str.extract('(\d+)')
print(data)

# Find all matches (returns lists of matches)
data['Find_All'] = data['City'].str.findall('[A-Za-z]+')
print(data)

      Name           City Name_Lower Name_Upper Name_Cap   City_Trimmed  \
0    Alice       New York      alice      ALICE    Alice       New York   
1      Bob    Los Angeles        bob        BOB      Bob    Los Angeles   
2  Charlie  San Francisco    charlie    CHARLIE  Charlie  San Francisco   

            City_Padded Name_Substr City_Part  City_Replaced  \
0  ------------New York         Ali      York       New_York   
1  ---------Los Angeles         Bob      Ange    Los_Angeles   
2  -------San Francisco         Cha      Fran  San_Francisco   

         City_Split    City_Joined  Starts_With_S  Has_Digits Extract_Number  
0       [New, York]       New-York          False       False            NaN  
1    [Los, Angeles]    Los-Angeles          False       False            NaN  
2  [San, Francisco]  San-Francisco          False       False            NaN  
      Name           City Name_Lower Name_Upper Name_Cap   City_Trimmed  \
0    Alice       New York      alice      ALICE    

In [200]:
# Custom function
data['Custom'] = data['Name'].apply(lambda x: x[::-1])  # Reverse string
print(data)

      Name           City Name_Lower Name_Upper Name_Cap   City_Trimmed  \
0    Alice       New York      alice      ALICE    Alice       New York   
1      Bob    Los Angeles        bob        BOB      Bob    Los Angeles   
2  Charlie  San Francisco    charlie    CHARLIE  Charlie  San Francisco   

            City_Padded Name_Substr City_Part  City_Replaced  \
0  ------------New York         Ali      York       New_York   
1  ---------Los Angeles         Bob      Ange    Los_Angeles   
2  -------San Francisco         Cha      Fran  San_Francisco   

         City_Split    City_Joined  Starts_With_S  Has_Digits Extract_Number  \
0       [New, York]       New-York          False       False            NaN   
1    [Los, Angeles]    Los-Angeles          False       False            NaN   
2  [San, Francisco]  San-Francisco          False       False            NaN   

           Find_All   Custom  
0       [New, York]    ecilA  
1    [Los, Angeles]      boB  
2  [San, Francisco]  eilrahC

# ***Window Functions***

In [201]:
data = pd.DataFrame({'Value': [1, 2, 3, 4, 5, 6]})
data['Rolling_Mean'] = data['Value'].rolling(window=3).mean()
print(data)

   Value  Rolling_Mean
0      1           NaN
1      2           NaN
2      3           2.0
3      4           3.0
4      5           4.0
5      6           5.0


In [202]:
data['Rolling_Sum'] = data['Value'].rolling(window=3).sum()
print(data)


   Value  Rolling_Mean  Rolling_Sum
0      1           NaN          NaN
1      2           NaN          NaN
2      3           2.0          6.0
3      4           3.0          9.0
4      5           4.0         12.0
5      6           5.0         15.0


In [203]:
data['Expanding_Mean'] = data['Value'].expanding().mean()
print(data)

   Value  Rolling_Mean  Rolling_Sum  Expanding_Mean
0      1           NaN          NaN             1.0
1      2           NaN          NaN             1.5
2      3           2.0          6.0             2.0
3      4           3.0          9.0             2.5
4      5           4.0         12.0             3.0
5      6           5.0         15.0             3.5


In [204]:
data['Expanding_Sum'] = data['Value'].expanding().sum()
print(data)


   Value  Rolling_Mean  Rolling_Sum  Expanding_Mean  Expanding_Sum
0      1           NaN          NaN             1.0            1.0
1      2           NaN          NaN             1.5            3.0
2      3           2.0          6.0             2.0            6.0
3      4           3.0          9.0             2.5           10.0
4      5           4.0         12.0             3.0           15.0
5      6           5.0         15.0             3.5           21.0


In [205]:
data['EWM_Mean'] = data['Value'].ewm(span=3, adjust=False).mean()
print(data)


   Value  Rolling_Mean  Rolling_Sum  Expanding_Mean  Expanding_Sum  EWM_Mean
0      1           NaN          NaN             1.0            1.0   1.00000
1      2           NaN          NaN             1.5            3.0   1.50000
2      3           2.0          6.0             2.0            6.0   2.25000
3      4           3.0          9.0             2.5           10.0   3.12500
4      5           4.0         12.0             3.0           15.0   4.06250
5      6           5.0         15.0             3.5           21.0   5.03125


In [206]:
data['Rolling_Custom'] = data['Value'].rolling(window=3).agg(lambda x: x.max() - x.min())
print(data)

   Value  Rolling_Mean  Rolling_Sum  Expanding_Mean  Expanding_Sum  EWM_Mean  \
0      1           NaN          NaN             1.0            1.0   1.00000   
1      2           NaN          NaN             1.5            3.0   1.50000   
2      3           2.0          6.0             2.0            6.0   2.25000   
3      4           3.0          9.0             2.5           10.0   3.12500   
4      5           4.0         12.0             3.0           15.0   4.06250   
5      6           5.0         15.0             3.5           21.0   5.03125   

   Rolling_Custom  
0             NaN  
1             NaN  
2             2.0  
3             2.0  
4             2.0  
5             2.0  


In [207]:
time_data = pd.DataFrame({
    'Date': pd.date_range(start='2025-01-01', periods=10),
    'Value': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
})

time_data.set_index('Date', inplace=True)
time_data['Rolling_Mean'] = time_data['Value'].rolling(window=3).mean()
print(time_data)

            Value  Rolling_Mean
Date                           
2025-01-01      1           NaN
2025-01-02      2           NaN
2025-01-03      3           2.0
2025-01-04      4           3.0
2025-01-05      5           4.0
2025-01-06      6           5.0
2025-01-07      7           6.0
2025-01-08      8           7.0
2025-01-09      9           8.0
2025-01-10     10           9.0


# **Apply Functions**

In [208]:
data = pd.Series([1, 2, 3, 4, 5])

# Apply a function to square each element
data_squared = data.apply(lambda x: x ** 2)
print(data_squared)

0     1
1     4
2     9
3    16
4    25
dtype: int64


In [209]:
# Apply a function to categorize values
data_categorized = data.apply(lambda x: 'Even' if x % 2 == 0 else 'Odd')
print(data_categorized)

0     Odd
1    Even
2     Odd
3    Even
4     Odd
dtype: object


In [210]:
df = pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]})

# Apply a function to each column
result = df.apply(lambda x: x.sum(), axis=0)
print(result)


A     6
B    15
dtype: int64


In [211]:
# Apply a function to each row
result = df.apply(lambda x: x.sum(), axis=1)
print(result)


0    5
1    7
2    9
dtype: int64


In [212]:
# Add a new column based on row values
df['C'] = df.apply(lambda x: x['A'] * x['B'], axis=1)
print(df)

   A  B   C
0  1  4   4
1  2  5  10
2  3  6  18


In [213]:
# Apply a function to every element
result = df.map(lambda x: x ** 2)
print(result)


   A   B    C
0  1  16   16
1  4  25  100
2  9  36  324


In [214]:
# Custom function with multiple arguments
def custom_function(x, factor):
    return x * factor

data_transformed = data.apply(custom_function, args=(10,))
print(data_transformed)

0    10
1    20
2    30
3    40
4    50
dtype: int64


# **Read Data From Excel and csv**

In [215]:
data = pd.read_csv("https://docs.google.com/spreadsheets/d/e/2PACX-1vStsdzk_-1KtqSH8EraxBtczeIYXBwAEFL1rMj9fPZRKIOba6Ks3-HoEBmEpj-AEsypmJw_RM7cai5v/pub?gid=0&single=true&output=csv")



In [216]:
# View the first few rows
print(data.head())

                           industry  level             size line_code  value
0                             total      0   6–19 employees  C0300.01  15639
1                             total      0  20–49 employees  C0300.01   2943
2                             total      0  50–99 employees  C0300.01    639
3                             total      0   100+ employees  C0300.01    555
4  Agriculture, forestry, & fishing      1            total  C0300.01    348


In [217]:
data = pd.read_csv("https://docs.google.com/spreadsheets/d/e/2PACX-1vStsdzk_-1KtqSH8EraxBtczeIYXBwAEFL1rMj9fPZRKIOba6Ks3-HoEBmEpj-AEsypmJw_RM7cai5v/pub?gid=0&single=true&output=csv", sep=";")
print(data)

                  industry,level,size,line_code,value
0               total,0,6–19 employees,C0300.01,15639
1               total,0,20–49 employees,C0300.01,2943
2                total,0,50–99 employees,C0300.01,639
3                 total,0,100+ employees,C0300.01,555
4   Agriculture, forestry, & fishing,1,total,C0300...
..                                                ...
94               total,0,6–19 employees,C0300.03,4290
95              total,0,20–49 employees,C0300.03,1062
96               total,0,50–99 employees,C0300.03,438
97                total,0,100+ employees,C0300.03,339
98  Agriculture, forestry, & fishing,1,total,C0300...

[99 rows x 1 columns]


In [218]:
#data = pd.read_csv("https://docs.google.com/spreadsheets/d/e/2PACX-1vStsdzk_-1KtqSH8EraxBtczeIYXBwAEFL1rMj9fPZRKIOba6Ks3-HoEBmEpj-AEsypmJw_RM7cai5v/pub?gid=0&single=true&output=csv", index_col="id")

In [219]:
data = pd.read_csv("https://docs.google.com/spreadsheets/d/e/2PACX-1vStsdzk_-1KtqSH8EraxBtczeIYXBwAEFL1rMj9fPZRKIOba6Ks3-HoEBmEpj-AEsypmJw_RM7cai5v/pub?gid=0&single=true&output=csv", usecols=["level", "line_code"])
print(data)

    level line_code
0       0  C0300.01
1       0  C0300.01
2       0  C0300.01
3       0  C0300.01
4       1  C0300.01
..    ...       ...
94      0  C0300.03
95      0  C0300.03
96      0  C0300.03
97      0  C0300.03
98      1  C0300.03

[99 rows x 2 columns]


In [220]:
data = pd.read_csv("https://docs.google.com/spreadsheets/d/e/2PACX-1vStsdzk_-1KtqSH8EraxBtczeIYXBwAEFL1rMj9fPZRKIOba6Ks3-HoEBmEpj-AEsypmJw_RM7cai5v/pub?gid=0&single=true&output=csv", usecols=["level", "line_code"],nrows=10)
print(data)

   level line_code
0      0  C0300.01
1      0  C0300.01
2      0  C0300.01
3      0  C0300.01
4      1  C0300.01
5      2  C0300.01
6      2  C0300.01
7      2  C0300.01
8      2  C0300.01
9      1  C0300.01


In [221]:
data = pd.read_csv("https://docs.google.com/spreadsheets/d/e/2PACX-1vStsdzk_-1KtqSH8EraxBtczeIYXBwAEFL1rMj9fPZRKIOba6Ks3-HoEBmEpj-AEsypmJw_RM7cai5v/pub?gid=0&single=true&output=csv",nrows=10,na_values=["N/A", "null","total"])
print(data)

                                            industry  level             size  \
0                                                NaN      0   6–19 employees   
1                                                NaN      0  20–49 employees   
2                                                NaN      0  50–99 employees   
3                                                NaN      0   100+ employees   
4                   Agriculture, forestry, & fishing      1              NaN   
5                                        Agriculture      2              NaN   
6                                 Commercial fishing      2              NaN   
7                                 Forestry & logging      2              NaN   
8  Agriculture, forestry, & fishing support services      2              NaN   
9                                             Mining      1              NaN   

  line_code  value  
0  C0300.01  15639  
1  C0300.01   2943  
2  C0300.01    639  
3  C0300.01    555  
4  C0300.01   

In [222]:
data["industry"].fillna("Unknown", inplace=True)

# Fill missing Salary with the average
#data["size"].fillna(data["size"].mean(), inplace=True)

print(data)

                                            industry  level             size  \
0                                            Unknown      0   6–19 employees   
1                                            Unknown      0  20–49 employees   
2                                            Unknown      0  50–99 employees   
3                                            Unknown      0   100+ employees   
4                   Agriculture, forestry, & fishing      1              NaN   
5                                        Agriculture      2              NaN   
6                                 Commercial fishing      2              NaN   
7                                 Forestry & logging      2              NaN   
8  Agriculture, forestry, & fishing support services      2              NaN   
9                                             Mining      1              NaN   

  line_code  value  
0  C0300.01  15639  
1  C0300.01   2943  
2  C0300.01    639  
3  C0300.01    555  
4  C0300.01   

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data["industry"].fillna("Unknown", inplace=True)


In [223]:
filtered_data = data[(data["level"] > 1) & (data["value"] > 100)]
print(filtered_data)

                                            industry  level size line_code  \
5                                        Agriculture      2  NaN  C0300.01   
8  Agriculture, forestry, & fishing support services      2  NaN  C0300.01   

   value  
5    177  
8    159  


In [224]:
data.to_csv("output.csv")