In [200]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import re
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split

In [201]:
df = pd.read_csv(r"C:\Users\ACER\Downloads\archive\Dog_Bites_Data.csv")

In [202]:
df

Unnamed: 0,UniqueID,DateOfBite,Species,Breed,Age,Gender,SpayNeuter,Borough,ZipCode
0,1,January 01 2018,DOG,UNKNOWN,,U,False,Brooklyn,11220
1,2,January 04 2018,DOG,UNKNOWN,,U,False,Brooklyn,
2,3,January 06 2018,DOG,Pit Bull,,U,False,Brooklyn,11224
3,4,January 08 2018,DOG,Mixed/Other,4,M,False,Brooklyn,11231
4,5,January 09 2018,DOG,Pit Bull,,U,False,Brooklyn,11224
...,...,...,...,...,...,...,...,...,...
22658,10276,December 24 2017,DOG,CHIWEENIE MIX,7,M,True,Staten Island,10303
22659,10277,December 24 2017,DOG,DUNKER,5,F,True,Staten Island,10303
22660,10278,December 21 2017,DOG,"Schnauzer, Miniature",10M,M,True,Staten Island,10312
22661,10279,December 28 2017,DOG,Mixed/Other,,F,False,Staten Island,10308


EDA

In [203]:
#shape type and all
df = df.drop('UniqueID',axis = 1)

In [204]:

# Define a mapping for common breed variations
breed_mapping = {
    'pit bull': ['pitbull', 'pit bull terrier'],
    'bulldog': ['bull dog', 'bulldog english'],
    'german shepherd': ['german shepard', 'alsatian']
}

def clean_breed_enhanced(breed):
    breed = str(breed).strip().lower()
    if breed in ['unknown', 'u']:
        return 'unknown'
    # Remove special characters and standardize separators
    breed = re.sub(r'[^a-z\s]', ' ', breed)
    breed = re.sub(r'\s+', ' ', breed).strip()

    # Map to standardized breed name
    for standard_name, variants in breed_mapping.items():
        if any(variant in breed for variant in variants) or standard_name in breed:
            return standard_name
    return breed

def clean_age_robust(age):
    age_str = str(age).strip().lower()
    if age_str in ['unknown', 'u', 'nan']:
        return None
    
    # Regex to find number and unit
    match = re.search(r'(\d*\.?\d+)\s*([ym]?)\s*(?:years?|yrs?|months?|mths?)?', age_str)
    if match:
        num, unit = float(match.group(1)), match.group(2)
        if unit == 'm' or 'month' in age_str:
            return round(num / 12, 2)  # Convert months to years
        return int(num)  # Return years as integer
    return None


df.dropna(inplace=True)
df.drop_duplicates(inplace=True)

# Standardize data formats
df['date'] = pd.to_datetime(df['DateOfBite'], errors='coerce')
df['Age'] = pd.to_numeric(df['Age'], errors='coerce')
df = df[df['Age'] >= 0]  # Validate data integrity


# Clean text columns
text_columns = ['Breed', 'other_text_column']
for col in text_columns:
    df[col] = df[col].str.strip().str.lower()


In [205]:
df.to_csv(r"C:\Users\ACER\Downloads\archive\q11.csv", index=False)

In [206]:
df['Breed'] = df['Breed'].apply(clean_breed_enhanced)
df['Age'] = df['Age'].apply(clean_age_robust)

In [207]:
df['Breed'].value_counts()
# df['Age'].value_counts()

Breed
pit bull             5457
unknown              2349
nan                  2218
german shepherd       739
shih tzu              732
                     ... 
russell terrier         1
beagle x basset         1
coonhound x             1
pit dalmatian mix       1
lab coo hound           1
Name: count, Length: 1256, dtype: int64

In [208]:
df.loc[df['Breed']=='unknown dogs','Breed'] = 'unknown'

In [209]:
# Convert to numeric (handles string floats)
df['Age'] = pd.to_numeric(df['Age'], errors='coerce')

# Apply filter with assignment
df = df[~df['Age'].isin([2018.00, 2022.00])]

# Verify remova
df['Age'].unique()

array([  nan,  4.  ,  5.  ,  3.  ,  7.  ,  6.  ,  8.  , 11.  , 13.  ,
        2.  ,  0.83,  1.  ,  0.75,  9.  , 10.  ,  0.67,  0.33, 12.  ,
       15.  ,  0.92,  0.58, 14.  ,  0.42,  0.25, 16.  ,  0.5 ,  0.17,
       19.  ,  1.08,  1.17, 20.  ,  1.33,  1.75, 17.  , 68.  ,  1.5 ,
        1.42,  0.08,  1.25,  1.83, 41.  ,  0.  , 21.  , 18.  ])

In [210]:
df = df[~df['ZipCode'].isin(['?'])]
df['ZipCode'].unique()

array(['11220', nan, '11224', '11231', '11233', '11235', '11208', '11215',
       '11238', '11207', '11205', '11209', '11237', '11217', '11236',
       '11234', '11214', '11230', '11211', '11225', '11210', '11223',
       '11232', '11226', '11201', '11212', '11216', '11239', '11219',
       '11218', '11206', '11229', '11222', '11203', '11204', '11221',
       '11249', '11213', '112', '11228', '111208', '112238', '1122',
       '112006', '10473', '10456', '10454', '10460', '10459', '10468',
       '10465', '10462', '10466', '10472', '10470', '10461', '10471',
       '10451', '10467', '10458', '10455', '10464', '10453', '10475',
       '10457', '10452', '10469', '10463', '10474', '100467', '10482',
       '104', '10987', '10549', '466', '10495', '10039', '10421', '10065',
       '10010', '10031', '10029', '10026', '10016', '10003', '10032',
       '10128', '10001', '10022', '10021', '10040', '10023', '10011',
       '10027', '10024', '10034', '10013', '10009', '10025', '10004',
       '1

In [211]:
df['DateOfBite'] = pd.to_datetime(df['DateOfBite'],errors = 'coerce')
df['Breed'] = df['Breed'].astype(str,errors = 'ignore')
df['Age'] = pd.to_numeric(df['Age'],errors = 'coerce')
df['ZipCode'] = pd.to_numeric(df['ZipCode'],errors = 'coerce')

In [212]:
# df.to_csv(r'C:\Users\ACER\Downloads\archive\q12.csv')

In [213]:
df

Unnamed: 0,DateOfBite,Species,Breed,Age,Gender,SpayNeuter,Borough,ZipCode
0,2018-01-01,DOG,unknown,,U,False,Brooklyn,11220.0
1,2018-01-04,DOG,unknown,,U,False,Brooklyn,
2,2018-01-06,DOG,pit bull,,U,False,Brooklyn,11224.0
3,2018-01-08,DOG,mixed other,4.00,M,False,Brooklyn,11231.0
4,2018-01-09,DOG,pit bull,,U,False,Brooklyn,11224.0
...,...,...,...,...,...,...,...,...
22658,2017-12-24,DOG,chiweenie mix,7.00,M,True,Staten Island,10303.0
22659,2017-12-24,DOG,dunker,5.00,F,True,Staten Island,10303.0
22660,2017-12-21,DOG,schnauzer miniature,0.83,M,True,Staten Island,10312.0
22661,2017-12-28,DOG,mixed other,,F,False,Staten Island,10308.0


In [214]:
df.drop_duplicates(inplace=True)
# df.dropna(inplace=True)
# df.loc[df['Age'].isna(),'Age'] = df['Age'].median()
# df.loc[df['ZipCode'].isna(),'ZipCode'] = df['ZipCode'].median()
imputer = SimpleImputer(strategy = 'median')
df[['Age','ZipCode']] = imputer.fit_transform(df[['Age','ZipCode']])

In [216]:
df['Age'].isna().sum()
df['ZipCode'].isna().sum()

np.int64(0)

In [219]:
#univariate, bivariate
df['Age'].var(),df['ZipCode'].cov(df['Age']),df['Age'].std(),df['ZipCode'].corr(df['Age'])


(np.float64(6.091227905912257),
 np.float64(-65.38881662682122),
 np.float64(2.468041309604087),
 np.float64(-0.012683574581275125))

In [221]:
def iqr_outlier_detection(series):
    """Detect outliers using IQR method"""

    # Calculate quartiles and IQR
    Q1 = series.quantile(0.25)
    Q3 = series.quantile(0.75)
    IQR = Q3 - Q1

    # Define outlier boundaries
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR

    # Identify outliers
    outliers = series[(series < lower_bound) | (series > upper_bound)]

    return {
        'Q1': Q1,
        'Q3': Q3,
        'IQR': IQR,
        'lower_bound': lower_bound,
        'upper_bound': upper_bound,
        'outliers': outliers,
        'outlier_count': len(outliers),
        'outlier_percentage': (len(outliers) / len(series)) * 100
    }
data = pd.Series([10, 12, 12, 13, 12, 11, 14, 13, 15, 10, 100])  # 100 is outlier
result = iqr_outlier_detection(data)
result

{'Q1': np.float64(11.5),
 'Q3': np.float64(13.5),
 'IQR': np.float64(2.0),
 'lower_bound': np.float64(8.5),
 'upper_bound': np.float64(16.5),
 'outliers': 10    100
 dtype: int64,
 'outlier_count': 1,
 'outlier_percentage': 9.090909090909092}

In [226]:
df.to_csv(r'C:\Users\ACER\Downloads\archive\q14.csv',index = False)

In [231]:
#boxplot
# Ensure 'DateOfBite' is in datetime format first
df['DateOfBite'] = pd.to_datetime(df['DateOfBite'], errors='coerce')

# Now set it as the index and sort
df.set_index('DateOfBite', inplace=True)
df.sort_index(inplace=True)
df.set_index('DateOfBite')['01-01-2018':'18-01-2018']['Breed'].plot(kind = 'bar')

KeyError: 'DateOfBite'

In [232]:
x = df[['experience','age','education','projects']]
y = df['salary']
train_x,test_x,train_y,test_y = train_test_split(x,y,test_size =0.2,random_state = 42)
lr = LinearRegression()
lr.fit(train_x,train_y)
pred = lr.predict(test_x)

mse = mean_squared_error(test_y, pred)
rmse = np.sqrt(mse)
mae = mean_absolute_error(test_y, pred)
r2 = r2_score(test_y, pred)
print(mse,rmse,mae,r2)

classification_report(test_y,pred)

KeyError: "None of [Index(['experience', 'age', 'education', 'projects'], dtype='object')] are in the [columns]"

In [233]:
import pandas as pd

# Sample data with target
df = pd.DataFrame({
    'category': ['A', 'B', 'A', 'C', 'B', 'C', 'A'],
    'target': [10, 20, 15, 5, 25, 8, 12]
})

# Target encoding
target_means = df.groupby('category')['target'].mean()
df['category_encoded'] = df['category'].map(target_means)
print(df)


  category  target  category_encoded
0        A      10         12.333333
1        B      20         22.500000
2        A      15         12.333333
3        C       5          6.500000
4        B      25         22.500000
5        C       8          6.500000
6        A      12         12.333333
