# Understanding pandas Data Structures

In [3]:
# importing pandas library
import pandas as pd

# Series

Series: A Series is a one-dimensional, array-like object that can hold any data type (integer, string, float, etc.). Each value is labeled by an index.

Creating a Series

In [8]:
# Two ways to build a Series:
#   s1 - from a list; the output shows pandas assigning integer labels 0..3.
#   s2 - from a dict; the dict keys ('a', 'b', 'c') become the index labels.
s1 = pd.Series(data=[10, 20, 30, 40])
s2 = pd.Series(data={'a': 1, 'b': 2, 'c': 3})

# Each heading is followed by its Series on the next line.
print("Series from list:", s1, sep="\n")
print("Series from dictionary:", s2, sep="\n")

Series from list:
0    10
1    20
2    30
3    40
dtype: int64
Series from dictionary:
a    1
b    2
c    3
dtype: int64


# DataFrame

DataFrame : A DataFrame is a two-dimensional, size-mutable, and potentially heterogeneous tabular data structure with labeled axes (rows and columns).

Creating a DataFrame

In [63]:
# Column-oriented input: every key is a column name and every list holds that
# column's values, one entry per row.
data = {
    'Name': ['Alice', 'Bob', 'Charlie', 'Dervin', 'Michael'],
    'Age': [25, 30, 35, 25, 35],
    'City': ['New York', 'Los Angeles', 'Chicago', 'Tokyo', 'Delhi'],
}
df = pd.DataFrame(data=data)

# Row-oriented input: every dict describes one row, keyed by column name.
data_1 = [
    {'Name': 'Alice', 'Age': 25, 'City': 'New York'},
    {'Name': 'Bob', 'Age': 30, 'City': 'Los Angeles'},
    {'Name': 'Charlie', 'Age': 35, 'City': 'Chicago'},
]
df_1 = pd.DataFrame(data=data_1)

# Show both frames, each under its own heading.
print("DataFrame from dictionary of lists:", df, sep="\n")
print("DataFrame from list of dictionaries:", df_1, sep="\n")

DataFrame from dictionary of lists:
      Name  Age         City
0    Alice   25     New York
1      Bob   30  Los Angeles
2  Charlie   35      Chicago
3   Dervin   25        Tokyo
4  Michael   35        Delhi
DataFrame from list of dictionaries:
      Name  Age         City
0    Alice   25     New York
1      Bob   30  Los Angeles
2  Charlie   35      Chicago


# Viewing Data

In [26]:
# Peek at both ends of the frame: the first three rows, then the last three.
# Relies on `df` created in the "Creating a DataFrame" cell.
print(df.head(n=3))
print(df.tail(n=3))

      Name  Age         City
0    Alice   25     New York
1      Bob   30  Los Angeles
2  Charlie   35      Chicago
      Name  Age     City
2  Charlie   35  Chicago
3   Dervin   25    Tokyo
4  Michael   35    Delhi


# Selecting data, filtering rows, and modifying data

In [32]:
# One column by label -> a Series.
print(df.loc[:, 'Name'])

# Several columns by label -> a DataFrame holding just those columns.
print(df.loc[:, ['Name', 'City']])

# One row by integer position (position 1 is the second row).
print(df.iloc[1])

# Rows that satisfy a condition: build a boolean mask, then index with it.
print("Age graeter than 25")
older_than_25 = df['Age'] > 25
print(df.loc[older_than_25])

0      Alice
1        Bob
2    Charlie
3     Dervin
4    Michael
Name: Name, dtype: object
      Name         City
0    Alice     New York
1      Bob  Los Angeles
2  Charlie      Chicago
3   Dervin        Tokyo
4  Michael        Delhi
Name            Bob
Age              30
City    Los Angeles
Name: 1, dtype: object
Age graeter than 25
      Name  Age         City
1      Bob   30  Los Angeles
2  Charlie   35      Chicago
4  Michael   35        Delhi


# Data manipulation

In [65]:
# Attach a 'Salary' column to `df`; the list supplies one value per existing
# row, in row order.
salary_values = [100000, 200000, 50000, 25000, 35000]
df['Salary'] = salary_values
print(df)

      Name  Age         City  Salary
0    Alice   25     New York  100000
1      Bob   30  Los Angeles  200000
2  Charlie   35      Chicago   50000
3   Dervin   25        Tokyo   25000
4  Michael   35        Delhi   35000


In [67]:
# Drop the 'City' column from `df`.
#
# - `columns=` names the axis explicitly instead of relying on `axis=1`.
# - The result is bound back to `df` rather than mutating with inplace=True,
#   so the cell reads as "old frame in, new frame out".
# - errors='ignore' keeps the cell re-runnable: on a second execution 'City'
#   is already gone, and without this flag drop() would raise KeyError.
df = df.drop(columns='City', errors='ignore')
print(df)

      Name  Age  Salary
0    Alice   25  100000
1      Bob   30  200000
2  Charlie   35   50000
3   Dervin   25   25000
4  Michael   35   35000


In [69]:
# Show the rows ordered by ascending Salary. The sorted frame is only printed,
# not stored, so `df` itself is not reassigned here.
print("Sorted Data by salary")
salary_ordered = df.sort_values(by='Salary')
print(salary_ordered)

Sorted Data by salary
      Name  Age  Salary
3   Dervin   25   25000
4  Michael   35   35000
2  Charlie   35   50000
0    Alice   25  100000
1      Bob   30  200000


# Reading a CSV file

In [74]:
# Load the dataset from a CSV file; the path is relative, so the file must be
# reachable from the notebook's working directory.
df_csv = pd.read_csv('Diabetes Missing Data.csv')

# head() with no argument shows the first few rows (five in the output below).
print("DataFrame from CSV file:", df_csv.head(), sep="\n")

DataFrame from CSV file:
   Pregnant  Glucose  Diastolic_BP  Skin_Fold  Serum_Insulin   BMI  \
0         6    148.0          72.0       35.0            NaN  33.6   
1         1     85.0          66.0       29.0            NaN  26.6   
2         8    183.0          64.0        NaN            NaN  23.3   
3         1     89.0          66.0       23.0           94.0  28.1   
4         0    137.0          40.0       35.0          168.0  43.1   

   Diabetes_Pedigree  Age  Class  
0              0.627   50      1  
1              0.351   31      0  
2              0.672   32      1  
3              0.167   21      0  
4              2.288   33      1  


# Handling Missing data

In [76]:
# Count the missing entries in every column: isna() flags each missing cell,
# and sum() adds up the flags column by column.
print("\nMissing Data in each column:")
missing_per_column = df_csv.isna().sum()
print(missing_per_column)


Missing Data in each column:
Pregnant               0
Glucose                5
Diastolic_BP          35
Skin_Fold            227
Serum_Insulin        374
BMI                   11
Diabetes_Pedigree      0
Age                    0
Class                  0
dtype: int64


In [78]:
# Grand total of missing cells: the first sum() gives per-column counts, the
# second sum() adds those counts together.
print("Tota Missing Data: ")
per_column_counts = df_csv.isna().sum()
print(per_column_counts.sum())

Tota Missing Data: 
652


# Fill missing values 

In [88]:
# Fill missing values in selected columns.
#
# fillna() returns a NEW object and leaves its input untouched, so the result
# must be bound back to df_csv. Calling it as a bare statement (as this cell
# previously did) silently discards the filled data.
#
# Of these three columns, only Glucose has gaps according to the
# missing-value counts shown earlier; Pregnant and Age are listed so the cell
# stays correct if the data changes.
fill_values = {
    'Pregnant': df_csv['Pregnant'].mean(),
    # Age is cast to int in the "Converting Datatypes" cell, so it is filled
    # with a number (the median) rather than a string placeholder such as
    # 'Unknown', which would make that cast fail.
    'Age': df_csv['Age'].median(),
    'Glucose': df_csv['Glucose'].mean(),
}
# A dict passed to DataFrame.fillna maps column name -> fill value; columns
# not named in the dict are left as they are.
df_csv = df_csv.fillna(value=fill_values)
print(df_csv)

     Pregnant  Glucose  Diastolic_BP  Skin_Fold  Serum_Insulin   BMI  \
0           6    148.0          72.0       35.0            NaN  33.6   
1           1     85.0          66.0       29.0            NaN  26.6   
2           8    183.0          64.0        NaN            NaN  23.3   
3           1     89.0          66.0       23.0           94.0  28.1   
4           0    137.0          40.0       35.0          168.0  43.1   
..        ...      ...           ...        ...            ...   ...   
763        10    101.0          76.0       48.0          180.0  32.9   
764         2    122.0          70.0       27.0            NaN  36.8   
765         5    121.0          72.0       23.0          112.0  26.2   
766         1    126.0          60.0        NaN            NaN  30.1   
767         1     93.0          70.0       31.0            NaN  30.4   

     Diabetes_Pedigree  Age  Class  
0                0.627   50      1  
1                0.351   31      0  
2                0.672  

# Remove duplicates

In [90]:
# Discard repeated rows, keeping the first occurrence of each, and rebind
# df_csv to the de-duplicated frame.
print("\nRemoving Duplicates:")
df_csv = df_csv.drop_duplicates(keep='first')
print("DataFrame after removing duplicates:", df_csv, sep="\n")


Removing Duplicates:
DataFrame after removing duplicates:
     Pregnant  Glucose  Diastolic_BP  Skin_Fold  Serum_Insulin   BMI  \
0           6    148.0          72.0       35.0            NaN  33.6   
1           1     85.0          66.0       29.0            NaN  26.6   
2           8    183.0          64.0        NaN            NaN  23.3   
3           1     89.0          66.0       23.0           94.0  28.1   
4           0    137.0          40.0       35.0          168.0  43.1   
..        ...      ...           ...        ...            ...   ...   
763        10    101.0          76.0       48.0          180.0  32.9   
764         2    122.0          70.0       27.0            NaN  36.8   
765         5    121.0          72.0       23.0          112.0  26.2   
766         1    126.0          60.0        NaN            NaN  30.1   
767         1     93.0          70.0       31.0            NaN  30.4   

     Diabetes_Pedigree  Age  Class  
0                0.627   50      1  
1 

# Converting Datatypes

In [92]:
# Cast two columns to explicit types: Pregnant becomes floating point and Age
# becomes integer. The two casts are independent of each other.
df_csv['Pregnant'] = df_csv['Pregnant'].astype(float)
df_csv['Age'] = df_csv['Age'].astype(int)

# List every column's dtype to confirm the conversion.
print("DataFrame after type conversion:", df_csv.dtypes, sep="\n")

DataFrame after type conversion:
Pregnant             float64
Glucose              float64
Diastolic_BP         float64
Skin_Fold            float64
Serum_Insulin        float64
BMI                  float64
Diabetes_Pedigree    float64
Age                    int32
Class                  int64
dtype: object


# Data Transformation

In [96]:
# Derive a new column from an existing one.
print("\nData Transformation:")

# 'Glucose_2' holds each row's Glucose value multiplied by 2.0.
df_csv['Glucose_2'] = df_csv['Glucose'].mul(2.0)

print("DataFrame with transformed data:", df_csv, sep="\n")


Data Transformation:
DataFrame with transformed data:
     Pregnant  Glucose  Diastolic_BP  Skin_Fold  Serum_Insulin   BMI  \
0         6.0    148.0          72.0       35.0            NaN  33.6   
1         1.0     85.0          66.0       29.0            NaN  26.6   
2         8.0    183.0          64.0        NaN            NaN  23.3   
3         1.0     89.0          66.0       23.0           94.0  28.1   
4         0.0    137.0          40.0       35.0          168.0  43.1   
..        ...      ...           ...        ...            ...   ...   
763      10.0    101.0          76.0       48.0          180.0  32.9   
764       2.0    122.0          70.0       27.0            NaN  36.8   
765       5.0    121.0          72.0       23.0          112.0  26.2   
766       1.0    126.0          60.0        NaN            NaN  30.1   
767       1.0     93.0          70.0       31.0            NaN  30.4   

     Diabetes_Pedigree  Age  Class  Glucose_2  
0                0.627   50     

# Summary statistics

In [102]:
# Numeric overview: describe() reports count, mean, std, min, the 25%/50%/75%
# quantiles and max for the numerical columns of `df`.
print("Summary Statistics:", df.describe(), sep="\n")

# Categorical overview: how many times each distinct value occurs in 'Name'.
name_counts = df['Name'].value_counts()
print("\nValue Counts for 'Name':", name_counts, sep="\n")

Summary Statistics:
        Age        Salary
count   5.0       5.00000
mean   30.0   82000.00000
std     5.0   71989.58258
min    25.0   25000.00000
25%    25.0   35000.00000
50%    30.0   50000.00000
75%    35.0  100000.00000
max    35.0  200000.00000

Value Counts for 'Name':
Name
Alice      1
Bob        1
Charlie    1
Dervin     1
Michael    1
Name: count, dtype: int64


# Grouping data 

In [107]:
# Add a 'dept' column, one department code per existing row in row order.
df['dept'] = ['IT', 'CSE', 'IT', 'CSE', 'IT']
print(df)

# Split the rows by 'dept' and average the Salary values within each group.
grouped = df.groupby('dept')['Salary'].agg('mean')
print("\nAverage Salary by Department:", grouped, sep="\n")

      Name  Age  Salary dept
0    Alice   25  100000   IT
1      Bob   30  200000  CSE
2  Charlie   35   50000   IT
3   Dervin   25   25000  CSE
4  Michael   35   35000   IT

Average Salary by Department:
dept
CSE    112500.000000
IT      61666.666667
Name: Salary, dtype: float64


# Merging DataFrames

In [112]:
# Two small tables sharing an 'EmployeeID' key. IDs 2 and 3 appear in both,
# ID 1 only in the names table, ID 4 only in the salary table.
data1 = {
    'EmployeeID': [1, 2, 3],
    'Name': ['Alice', 'Bob', 'Charlie'],
}
data2 = {
    'EmployeeID': [2, 3, 4],
    'Salary': [60000, 70000, 80000],
}
df1 = pd.DataFrame(data1)
df2 = pd.DataFrame(data2)

# Inner join: keep only the EmployeeIDs present in both tables.
merged_df = df1.merge(df2, on='EmployeeID', how='inner')
print("\nMerged DataFrame (inner join):")
print(merged_df)

# The other join types, each printed under its own heading:
#   left  - every row of df1, with NaN where df2 has no match
#   right - every row of df2, with NaN where df1 has no match
#   outer - every EmployeeID from either table
for heading, join_type in (
    ("\nLeft Join:", 'left'),
    ("\nRight Join:", 'right'),
    ("\nOuter Join:", 'outer'),
):
    print(heading)
    print(df1.merge(df2, on='EmployeeID', how=join_type))


Merged DataFrame (inner join):
   EmployeeID     Name  Salary
0           2      Bob   60000
1           3  Charlie   70000

Left Join:
   EmployeeID     Name   Salary
0           1    Alice      NaN
1           2      Bob  60000.0
2           3  Charlie  70000.0

Right Join:
   EmployeeID     Name  Salary
0           2      Bob   60000
1           3  Charlie   70000
2           4      NaN   80000

Outer Join:
   EmployeeID     Name   Salary
0           1    Alice      NaN
1           2      Bob  60000.0
2           3  Charlie  70000.0
3           4      NaN  80000.0


# Joining DataFrames

In [116]:
# Two frames that carry the same row labels (1, 2, 3) but different columns.
data1 = {
    'Name': ['Alice', 'Bob', 'Charlie'],
    'Age': [25, 30, 35],
}
data2 = {
    'Salary': [50000, 60000, 70000],
}
df1 = pd.DataFrame(data1, index=[1, 2, 3])
df2 = pd.DataFrame(data2, index=[1, 2, 3])

# join() lines rows up by index label (not by a key column) and appends
# df2's columns to df1's.
joined_df = df1.join(other=df2)
print("\nJoined DataFrame:", joined_df, sep="\n")


Joined DataFrame:
      Name  Age  Salary
1    Alice   25   50000
2      Bob   30   60000
3  Charlie   35   70000


# Concatenating DataFrames

In [118]:
# Two frames with identical columns, used for row-wise stacking.
data1 = {
    'Name': ['Alice', 'Bob'],
    'Age': [25, 30],
}
data2 = {
    'Name': ['Charlie', 'David'],
    'Age': [35, 40],
}
df1 = pd.DataFrame(data1)
df2 = pd.DataFrame(data2)

# Row-wise: df2's rows are appended beneath df1's. Each frame keeps its own
# row labels, so the labels 0 and 1 both appear twice in the result.
concat_df = pd.concat([df1, df2], axis='index')
print("\nConcatenated DataFrame (row-wise):")
print(concat_df)

# Column-wise: frames are placed side by side and matched on row label.
# df3 has four rows while df1 has two, so df1's columns are NaN for labels 2 and 3.
data3 = {
    'Salary': [50000, 60000, 70000, 80000],
}
df3 = pd.DataFrame(data3)
concat_col_df = pd.concat([df1, df3], axis='columns')
print("\nConcatenated DataFrame (column-wise):")
print(concat_col_df)


Concatenated DataFrame (row-wise):
      Name  Age
0    Alice   25
1      Bob   30
0  Charlie   35
1    David   40

Concatenated DataFrame (column-wise):
    Name   Age  Salary
0  Alice  25.0   50000
1    Bob  30.0   60000
2    NaN   NaN   70000
3    NaN   NaN   80000


# Advantages of Pandas

Pandas is a powerful tool for data handling and analysis in Python, offering several advantages over traditional data structures:

>Enhanced Data Structures: DataFrames and Series provide labeled axes and efficient data manipulation.

>Performance: Fast, vectorized operations and memory efficiency compared to lists or dictionaries.

>Rich Functionality: Built-in methods for data cleaning, transformation, aggregation, and analysis.

>Data Manipulation: Easy merging, joining, and concatenation of datasets.

>Exploratory Data Analysis (EDA): Quick computation of summary statistics and grouping for pattern analysis.

# Real-world applications of pandas

>Data Cleaning: Handling missing values in financial datasets.

>Exploratory Data Analysis: Summarizing and visualizing sales data to understand trends.

>Time Series Analysis: Manipulating stock price data for forecasting.

>Data Integration: Combining sales, marketing, and finance data for comprehensive analysis.

>Machine Learning: Preprocessing data for model training.