# Global College Statistics Analysis

This notebook analyzes the Global College Statistics Dataset to provide insights about educational institutions worldwide.

In [None]:
# Import required libraries
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from scipy import stats

# Set style for better visualizations.
# matplotlib 3.6 renamed its bundled seaborn style sheets to 'seaborn-v0_8*' and
# matplotlib 3.8 removed the old names, so a bare plt.style.use('seaborn') raises
# OSError on current installs. Use whichever name this matplotlib provides.
seaborn_style = 'seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn'
plt.style.use(seaborn_style)
sns.set_palette("husl")
plt.rcParams['figure.figsize'] = [12, 8]

## Data Loading and Initial Exploration

In [None]:
# Load the CSV that sits next to the notebook into a DataFrame
df = pd.read_csv(filepath_or_buffer='College Data.csv')

# Print a structural summary of the frame (df.info() writes straight to stdout)
print("Dataset Info:")
df.info()

# Preview the top of the table; as the cell's last expression it is rendered as output
print("\nFirst few rows:")
df.head(n=5)

## 1. Country Analysis

In [None]:
# Per-country summary: 'Total Students' is summed, every other metric is averaged,
# and the result is rounded to two decimals
country_stats = (
    df.groupby('Country')
      .agg({
          'Total Students': 'sum',
          **dict.fromkeys(
              ['Placement Rate', 'CGPA', 'Research Papers Published', 'Faculty Count'],
              'mean',
          ),
      })
      .round(2)
)

# Order countries from the largest to the smallest student total and show the first ten
country_stats_sorted = country_stats.sort_values(by='Total Students', ascending=False)
country_stats_sorted.head(10)

## 2. Branch Analysis

In [None]:
# Per-branch summary: summed 'Total Students' plus the mean of the remaining metrics,
# rounded to two decimals
branch_stats = (
    df.groupby('Branch')
      .agg({
          'Total Students': 'sum',
          **dict.fromkeys(['Placement Rate', 'CGPA', 'Research Papers Published'], 'mean'),
      })
      .round(2)
)

# Scatter of CGPA against placement rate for every row, coloured by branch
fig, ax = plt.subplots(figsize=(15, 8))
sns.scatterplot(data=df, x='CGPA', y='Placement Rate', hue='Branch', alpha=0.6, ax=ax)
ax.set_title('CGPA vs Placement Rate by Branch')
# Anchor the legend just outside the right edge of the axes
ax.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
fig.tight_layout()
plt.show()

## 3. Income Analysis

In [None]:
# Bucket 'Annual Family Income' into five quantile-based brackets (pd.qcut with q=5),
# labelled from 'Very Low' to 'Very High'. The result is an ordered categorical column.
df['Income_Bracket'] = pd.qcut(
    df['Annual Family Income'],
    q=5,
    labels=['Very Low', 'Low', 'Medium', 'High', 'Very High'],
)

# Analyze performance metrics by income bracket.
# Grouping on a categorical key without `observed` emits a FutureWarning on
# pandas >= 2.1 and the default flips in pandas 3.0; observed=False pins the
# behavior this notebook has always had (one row per bracket label).
income_analysis = df.groupby('Income_Bracket', observed=False).agg({
    'CGPA': 'mean',
    'Placement Rate': 'mean',
    'Research Papers Published': 'mean'
}).round(2)

# Plot CGPA distribution by income
plt.figure(figsize=(12, 6))
sns.boxplot(data=df, x='Income_Bracket', y='CGPA')
plt.title('CGPA Distribution by Income Bracket')
plt.xticks(rotation=45)
plt.show()

## 4. Gender Analysis

In [None]:
# Sum the 'Male' and 'Female' columns within each branch
gender_branch = df.groupby('Branch')[['Male', 'Female']].sum()

# Female share of the combined Male + Female total, as a percentage with two decimals
branch_headcount = gender_branch['Male'] + gender_branch['Female']
gender_branch['Female_Percentage'] = (
    gender_branch['Female'].div(branch_headcount).mul(100).round(2)
)

# Bar chart of the female percentage, one bar per branch
fig, ax = plt.subplots(figsize=(15, 8))
sns.barplot(data=gender_branch.reset_index(), x='Branch', y='Female_Percentage', ax=ax)
ax.set_title('Female Representation by Branch (%)')
plt.xticks(rotation=45)
plt.show()

## 5. Research Impact Analysis

In [None]:
# Correlation of 'Research Papers Published' with placement rate, CGPA and faculty
# count: build the correlation matrix of the four columns and keep the research column
research_corr = (
    df.loc[:, ['Research Papers Published', 'Placement Rate', 'CGPA', 'Faculty Count']]
      .corr()
      .loc[:, 'Research Papers Published']
)

# Scatter with a fitted regression line: papers published vs placement rate
fig, ax = plt.subplots(figsize=(10, 6))
sns.regplot(
    data=df,
    x='Research Papers Published',
    y='Placement Rate',
    scatter_kws={'alpha':0.5},
    ax=ax,
)
ax.set_title('Impact of Research Papers on Placement Rate')
plt.show()

## 6. Faculty Analysis

In [None]:
# Calculate and analyze student-faculty ratio.
# A 'Faculty Count' of 0 would make the division return inf, which then poisons the
# mean/max below, stretches the scatter axis and breaks any later average of this
# column. Treat zero faculty as "ratio undefined" (NaN) so pandas skips those rows.
faculty_count = df['Faculty Count'].replace(0, np.nan)
df['Student_Faculty_Ratio'] = df['Total Students'] / faculty_count

# Mean, minimum and maximum ratio within each branch, rounded to two decimals
faculty_analysis = df.groupby('Branch')['Student_Faculty_Ratio'].agg(['mean', 'min', 'max']).round(2)

# Plot impact on CGPA
plt.figure(figsize=(10, 6))
sns.scatterplot(data=df, x='Student_Faculty_Ratio', y='CGPA', alpha=0.5)
plt.title('Impact of Student-Faculty Ratio on CGPA')
plt.show()

## Summary of Key Findings

In [None]:
# Print the headline figures as one newline-separated report.
# The "most" entries take the first label of value_counts(), i.e. the most frequent value.
print(
    "Key Findings:",
    f"Total number of colleges analyzed: {len(df)}",
    f"Average placement rate: {df['Placement Rate'].mean():.2f}%",
    f"Average CGPA: {df['CGPA'].mean():.2f}",
    f"Country with most colleges: {df['Country'].value_counts().index[0]}",
    f"Most common branch: {df['Branch'].value_counts().index[0]}",
    f"Average Student-Faculty Ratio: {df['Student_Faculty_Ratio'].mean():.2f}",
    sep="\n",
)