# Pandas Tutorial

This notebook demonstrates the basic functionalities of Pandas, a powerful data manipulation library for Python.

In [1]:
import pandas as pd
import numpy as np

## 1. Creating DataFrames

In [2]:
# Create a DataFrame from a dictionary
df = pd.DataFrame({
    'A': [1, 2, 3],
    'B': ['a', 'b', 'c'],
    'C': [4.5, 5.5, 6.5]
})

print(df)

# Create a DataFrame from a list of dictionaries
df2 = pd.DataFrame([
    {'name': 'John', 'age': 30},
    {'name': 'Alice', 'age': 25}
])

print(df2)

   A  B    C
0  1  a  4.5
1  2  b  5.5
2  3  c  6.5
    name  age
0   John   30
1  Alice   25


## 2. Reading and Writing Data

In [10]:
# Reading a CSV file
# df = pd.read_csv('example.csv')  # Uncomment and replace with your file path

# Writing to a CSV file
df.to_csv('output.csv', index=False)

# Reading an Excel file
# df_excel = pd.read_excel('example.xlsx')  # Uncomment and replace with your file path

# Writing to an Excel file
df.to_excel('output.xlsx', index=False)

ModuleNotFoundError: No module named 'openpyxl'

## 3. Basic Operations

In [5]:
# Viewing the first few rows
print(df.head())

# Getting basic information about the DataFrame
print(df.info())

# Descriptive statistics
print(df.describe())

# Selecting a single column
print(df['A'])

# Selecting multiple columns
print(df[['A', 'B']])

# Filtering rows
print(df[df['A'] > 1])

   A  B    C
0  1  a  4.5
1  2  b  5.5
2  3  c  6.5
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3 entries, 0 to 2
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   A       3 non-null      int64  
 1   B       3 non-null      object 
 2   C       3 non-null      float64
dtypes: float64(1), int64(1), object(1)
memory usage: 204.0+ bytes
None
         A    C
count  3.0  3.0
mean   2.0  5.5
std    1.0  1.0
min    1.0  4.5
25%    1.5  5.0
50%    2.0  5.5
75%    2.5  6.0
max    3.0  6.5
0    1
1    2
2    3
Name: A, dtype: int64
   A  B
0  1  a
1  2  b
2  3  c
   A  B    C
1  2  b  5.5
2  3  c  6.5


## 4. Data Cleaning

In [6]:
# Create a DataFrame with missing values
df_missing = pd.DataFrame({
    'A': [1, 2, np.nan, 4],
    'B': [5, np.nan, np.nan, 8],
    'C': [9, 10, 11, 12]
})

print("Original DataFrame:")
print(df_missing)

# Dropping rows with any missing values
print("\nAfter dropping rows with missing values:")
print(df_missing.dropna())

# Filling missing values
print("\nAfter filling missing values with 0:")
print(df_missing.fillna(0))

# Replacing values
print("\nAfter replacing 1 with 100:")
print(df_missing.replace(1, 100))

Original DataFrame:
     A    B   C
0  1.0  5.0   9
1  2.0  NaN  10
2  NaN  NaN  11
3  4.0  8.0  12

After dropping rows with missing values:
     A    B   C
0  1.0  5.0   9
3  4.0  8.0  12

After filling missing values with 0:
     A    B   C
0  1.0  5.0   9
1  2.0  0.0  10
2  0.0  0.0  11
3  4.0  8.0  12

After replacing 1 with 100:
       A    B   C
0  100.0  5.0   9
1    2.0  NaN  10
2    NaN  NaN  11
3    4.0  8.0  12


## 5. Data Analysis

In [7]:
# Grouping and aggregation
df_group = pd.DataFrame({
    'Category': ['A', 'B', 'A', 'B', 'A'],
    'Value': [10, 20, 30, 40, 50]
})

print("Grouped sum:")
print(df_group.groupby('Category')['Value'].sum())

# Pivot tables
df_pivot = pd.DataFrame({
    'Date': ['2023-01-01', '2023-01-01', '2023-01-02', '2023-01-02'],
    'Product': ['A', 'B', 'A', 'B'],
    'Sales': [100, 200, 150, 250]
})

print("\nPivot table:")
print(df_pivot.pivot(index='Date', columns='Product', values='Sales'))

Grouped sum:
Category
A    90
B    60
Name: Value, dtype: int64

Pivot table:
Product       A    B
Date                
2023-01-01  100  200
2023-01-02  150  250


## 6. Merging and Joining

In [8]:
# Create two DataFrames
df1 = pd.DataFrame({'key': ['A', 'B', 'C'], 'value': [1, 2, 3]})
df2 = pd.DataFrame({'key': ['A', 'B', 'D'], 'value': [4, 5, 6]})

print("DataFrame 1:")
print(df1)
print("\nDataFrame 2:")
print(df2)

# Merge DataFrames
print("\nInner merge:")
print(pd.merge(df1, df2, on='key'))

print("\nOuter merge:")
print(pd.merge(df1, df2, on='key', how='outer'))

# Concatenate DataFrames
print("\nConcatenated DataFrames:")
print(pd.concat([df1, df2]))

DataFrame 1:
  key  value
0   A      1
1   B      2
2   C      3

DataFrame 2:
  key  value
0   A      4
1   B      5
2   D      6

Inner merge:
  key  value_x  value_y
0   A        1        4
1   B        2        5

Outer merge:
  key  value_x  value_y
0   A      1.0      4.0
1   B      2.0      5.0
2   C      3.0      NaN
3   D      NaN      6.0

Concatenated DataFrames:
  key  value
0   A      1
1   B      2
2   C      3
0   A      4
1   B      5
2   D      6
