# Creating a data frame

In [None]:
import pandas as pd

# Build a DataFrame from a dictionary of equal-length lists: each key becomes
# a column header and the list stored under it supplies that column's values.
data = dict(
    Name=['Alice', 'Bob', 'Charlie'],
    Age=[25, 30, 35],
    City=['New York', 'Los Angeles', 'Chicago'],
)
df = pd.DataFrame(data)

# Emit the caption and the table on separate lines.
print("DataFrame:", df, sep="\n")


# Reading data from a CSV file

In [None]:
import pandas as pd

# Load the CSV file into a DataFrame. The relative path means 'data.csv' is
# looked up in the current working directory.
csv_path = 'data.csv'
df = pd.read_csv(csv_path)

# Preview only the leading rows instead of dumping the whole table.
print("DataFrame from CSV:", df.head(), sep="\n")


# DataFrame Information and Statistics

In [None]:

import pandas as pd

# Sample DataFrame: one text column and two numeric columns.
data = {
    'Name': ['Alice', 'Bob', 'Charlie'],
    'Age': [25, 30, 35],
    'Salary': [70000, 80000, 120000],
}
df = pd.DataFrame(data)

# Basic information about the DataFrame.
# info() writes its report itself and returns None, so it is called directly;
# wrapping it in print() would add a stray "None" line after the report.
print("DataFrame Info:")
df.info()

# Summary statistics, applicable to numerical values
print("\nSummary Statistics:")
print(df.describe())

# Selected summary statistics: pick the aggregations to compute per column.
# The result is stored and printed explicitly so that it is also shown when
# this code runs as a plain script (a bare expression is only echoed when it
# is the last statement of a notebook cell).
selected_stats = df.agg(
    {
        'Age': ['min', 'max', 'median'],
        'Salary': ['min', 'max', 'median'],
    }
)
print('selected summary statistics:')
print(selected_stats)



## Selecting Data

In [None]:
import pandas as pd
# Sample DataFrame
data = {
    'Name': ['Alice', 'Bob', 'Charlie'],
    'Age': [25, 30, 35],
    'Salary': [70000, 80000, 120000],
}
df = pd.DataFrame(data)

# Selecting a single column: indexing with one label
print("Select 'Name' column:")
print(df['Name'])

# Selecting multiple columns: indexing with a list of labels
print("\nSelect 'Name' and 'Age' columns:")
print(df[['Name', 'Age']])

# Selecting rows based on a condition / Boolean masking
print("\nSelect rows where Age > 30:")
print(df[df['Age'] > 30])

# Combine conditions with & and enclose each condition in parentheses:
# & binds more tightly than the comparison operators.
print("\nSelect rows where Age >= 30 and Salary is greater than 75k:")
print(df[(df['Age'] >= 30) & (df['Salary'] > 75000)])

# Selecting rows or columns using names: .loc[row_selector, column_label]
print("\nSelect column Name that satisfies the specified condition")
print(df.loc[df['Age'] > 30, 'Name'])

# Selecting rows by integer position with .iloc. The slice end is exclusive,
# so :2 covers positions 0 and 1, i.e. exactly the first two rows.
first_two_rows = df.iloc[:2]
print("\nSelect first two rows:")
print(first_two_rows)

# Row slice :2 -> first two rows; column slice -2: -> last two columns.
first_two_rows_last_two_cols = df.iloc[:2, -2:]
print("\nSelect first two rows and last two columns:")
print(first_two_rows_last_two_cols)




## Adding and Removing Columns

In [None]:
import pandas as pd

# Start from a two-column sample frame.
data = {
    'Name': ['Alice', 'Bob', 'Charlie'],
    'Age': [25, 30, 35],
}
df = pd.DataFrame(data)

# Add a column by assigning a list (one entry per row) to a new label.
df['Salary'] = [70000, 80000, 120000]
print("DataFrame with new column:", df, sep="\n")

# Remove the column again. drop() hands back a new frame rather than
# modifying df in place, so the name is rebound to the result.
df = df.drop(columns='Salary')
print("\nDataFrame after removing 'Salary' column:", df, sep="\n")


## Group By and Aggregation

In [None]:
import pandas as pd

# Sample frame: four employees spread over two departments.
data = {
    'Department': ['HR', 'Engineering', 'HR', 'Engineering'],
    'Employee': ['Alice', 'Bob', 'Charlie', 'David'],
    'Salary': [70000, 80000, 75000, 85000],
}
df = pd.DataFrame(data)

# Split-apply-combine: split the rows by 'Department', apply mean() to each
# group's 'Salary', and combine the per-group results into one Series.
by_department = df.groupby('Department')
# Printing the grouping itself only shows the GroupBy object's repr,
# not the grouped rows.
print(by_department)
grouped_df = by_department['Salary'].mean()

print("Average Salary by Department:", grouped_df, sep="\n")


## Count number of records by category

In [None]:
# Count how many rows carry each distinct 'Department' value.
# NOTE: this cell does not build its own data; it relies on a `df` with a
# 'Department' column created by an earlier cell.
department_counts = df['Department'].value_counts()
department_counts

## Handling Missing Data

In [None]:
import pandas as pd
import numpy as np

# Sample frame with exactly one gap (NaN) in every column.
df = pd.DataFrame({
    'Name': ['Alice', 'Bob', np.nan],
    'Age': [25, np.nan, 35],
    'Salary': [70000, 80000, np.nan],
})
print("Original DataFrame with missing values:", df, sep="\n")

# Strategy 1 - fill the gaps. Each column gets its own replacement: a
# placeholder for the text column and the column mean for the numeric ones.
replacements = {
    'Name': 'Unknown',
    'Age': df['Age'].mean(),
    'Salary': df['Salary'].mean(),
}
df_filled = df.fillna(value=replacements)
print("\nDataFrame after filling missing values:", df_filled, sep="\n")

# Strategy 2 - discard every row that contains at least one missing value.
df_dropped = df.dropna(axis=0, how='any')
print("\nDataFrame after dropping rows with missing values:", df_dropped, sep="\n")


## Sorting data

In [None]:
import pandas as pd

# Sample DataFrame
data = {
    'Name': ['Alice', 'Bob', 'Charlie'],
    'Age': [55, 30, 35],
    'Salary': [70000, 80000, 120000],
}
df = pd.DataFrame(data)

# Sorting by a single column; sort_values returns a new, sorted DataFrame
# and leaves df itself unchanged.
sorted_df = df.sort_values(by='Salary', ascending=False)

print("DataFrame sorted by Salary (descending):")
print(sorted_df)

# Sorting by multiple columns: the first key ('Age') is the primary sort key,
# later keys ('Salary') only break ties among equal values of earlier keys.
sorted_df = df.sort_values(by=['Age', 'Salary'], ascending=False)

# The caption names the keys in the order they are applied above.
print("DataFrame sorted by Age and Salary (descending):")
print(sorted_df)



# Applying Functions

In [None]:
import pandas as pd

# Sample frame with raw salary figures.
df = pd.DataFrame({
    'Name': ['Alice', 'Bob', 'Charlie'],
    'Salary': [70000, 80000, 120000],
})


def to_thousands(amount):
    """Return *amount* expressed in thousands (70000 -> 70.0)."""
    return amount / 1000


# apply() calls the function on every value of the column; the results are
# stored under a new column label.
df['Salary_in_Thousand'] = df['Salary'].apply(to_thousands)

print("DataFrame with applied function:", df, sep="\n")


In [None]:
import pandas as pd

# Sample frame: car sales and revenue per brand and region.
data = {
    'Brand': ['Toyota', 'Honda', 'Ford', 'BMW', 'Toyota', 'Honda', 'Ford', 'BMW'],
    'Region': ['North America', 'North America', 'Europe', 'Europe', 'Asia', 'Asia', 'North America', 'Asia'],
    'Sales': [15000, 12000, 10000, 8000, 7000, 6000, 11000, 9000],
    'Revenue': [30000000, 25000000, 20000000, 40000000, 35000000, 36000000, 22000000, 18000000],
}
df = pd.DataFrame(data)

# Group on the (Brand, Region) pair, then use named aggregation: each keyword
# is an output column name, mapped to the input column and the function
# applied to it.
by_brand_region = df.groupby(['Brand', 'Region'])
summary_df = by_brand_region.agg(
    Total_Sales=pd.NamedAgg(column='Sales', aggfunc='sum'),
    Total_Revenue=pd.NamedAgg(column='Revenue', aggfunc='sum'),
)

print(summary_df)
