## EXPERIMENTS

IMPORT DATA 


In [None]:
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
import scipy.stats as stats
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from mpl_toolkits.mplot3d import Axes3D

In [None]:
# Load the semicolon-delimited, Latin-1 encoded crime export into the working DataFrame.
csv_path = "C:/Users/USER/Desktop/ΓΙΑΝΝΗΣ/Msc BIG DATA ANALYTICS/THESIS/Crime_Data_from_2020.csv"
df = pd.read_csv(csv_path, sep=';', encoding="latin1")

In [None]:
# Print column names, dtypes and non-null counts of the freshly loaded frame.
df.info()

In [None]:
# Summary statistics for the columns pandas describes by default.
df.describe()

In [None]:
# Count missing values per column.
df.isna().sum()

In [None]:

# Keep only the first whitespace-separated token of each date string
# and parse it; values that cannot be parsed become NaT.
for date_column in ('Date Rptd', 'DATE OCC'):
    date_token = df[date_column].str.split().str[0]
    df[date_column] = pd.to_datetime(date_token, errors='coerce')



In [None]:

# Remove every '.' that still has another '.' after it (only the last dot
# survives), then convert to numbers; unconvertible values become NaN.
lon_text = df['LON'].str.replace(r'\.(?=.*\.)', '', regex=True)
df['LON'] = pd.to_numeric(lon_text, errors='coerce')



In [None]:
# Divide a coordinate by 10 whenever its magnitude exceeds the per-column limit
# (100 for LAT, 200 for LON); all other values pass through unchanged.
for coord_column, magnitude_limit in (('LAT', 100), ('LON', 200)):
    df[coord_column] = df[coord_column].apply(
        lambda value, cap=magnitude_limit: value / 10 if abs(value) > cap else value
    )

In [None]:
from datetime import datetime

def convert_military_time_to_time(time_str):
    """Turn a 24-hour clock string such as ``"0930"`` into a ``datetime.time``.

    The first two characters are read as the hour and the remainder as the
    minutes. Returns ``None`` when the pieces do not form a valid ``HH:MM``
    time (e.g. ``"2400"`` or non-digit text).
    """
    hour_part, minute_part = time_str[:2], time_str[2:]
    try:
        parsed = datetime.strptime(hour_part + ":" + minute_part, '%H:%M')
    except ValueError:
        return None
    return parsed.time()


# Left-pad each value to four characters (e.g. 930 -> "0930") before parsing it as HH:MM.
padded_times = df['TIME OCC'].map(lambda raw_value: str(raw_value).zfill(4))
df['TIME OCC'] = padded_times.map(convert_military_time_to_time)

In [None]:

# Replace missing entries in these columns with the literal 'Unknown'.
columns_to_fill = ['Weapon Used Cd', 'Weapon Desc', 'Premis Desc', 'Cross Street', 'Mocodes']
for column_name in columns_to_fill:
    df[column_name] = df[column_name].fillna('Unknown')


In [None]:
# Impute missing primary crime codes with the column's mode
# (the first one returned when several values tie).
primary_codes = df['Crm Cd 1']
most_frequent_value = primary_codes.mode()[0]
df['Crm Cd 1'] = primary_codes.fillna(most_frequent_value)

In [None]:
# Discard the secondary crime-code columns.
secondary_code_columns = ['Crm Cd 2', 'Crm Cd 3', 'Crm Cd 4']
df = df.drop(secondary_code_columns, axis=1)

In [None]:
# Calendar features taken from the report date: month number and
# day of week shifted to start at 1 (pandas' weekday is 0-based, Monday first).
reported_dt = df['Date Rptd'].dt
df['Month'] = reported_dt.month
df['Day of Week'] = reported_dt.dayofweek + 1

In [None]:
def categorize_crime(crime_desc):
    """Map a crime description to a coarse category by keyword.

    Keywords are checked in the order THEFT, ASSAULT, BURGLARY, so a
    description containing several of them gets the first match.

    Args:
        crime_desc: Description text. Non-string values (e.g. the NaN pandas
            uses for missing text) are accepted and treated as unmatched.

    Returns:
        'Theft', 'Assault', 'Burglary', or 'Other' when nothing matches.
    """
    # A missing description arrives as float NaN under Series.apply; the `in`
    # test would raise TypeError on it, so route it to the fallback instead.
    if not isinstance(crime_desc, str):
        return 'Other'

    for keyword, category in (
        ('THEFT', 'Theft'),
        ('ASSAULT', 'Assault'),
        ('BURGLARY', 'Burglary'),
    ):
        if keyword in crime_desc:
            return category
    return 'Other'

# Derive the coarse crime category for every row from its description.
df['Crime Category'] = df['Crm Cd Desc'].map(categorize_crime)


def severity_crime(crime_desc):
    """Grade a crime description as 'High', 'Medium' or 'Low' by keyword.

    'High' when the text contains MURDER, RAPE or ROBBERY; otherwise 'Medium'
    when it contains ASSAULT or BURGLARY; otherwise 'Low'.

    Args:
        crime_desc: Description text. Non-string values (e.g. NaN for a
            missing description) are accepted and graded 'Low', the same
            fallback used for unmatched text.

    Returns:
        One of 'High', 'Medium', 'Low'.
    """
    # NOTE(review): if the dataset labels killings as "HOMICIDE" rather than
    # "MURDER", those rows fall through to 'Low' — confirm against the data.

    # Missing descriptions show up as float NaN under Series.apply; the `in`
    # test would raise TypeError on them.
    if not isinstance(crime_desc, str):
        return 'Low'

    if any(keyword in crime_desc for keyword in ('MURDER', 'RAPE', 'ROBBERY')):
        return 'High'
    if any(keyword in crime_desc for keyword in ('ASSAULT', 'BURGLARY')):
        return 'Medium'
    return 'Low'

# Grade every row's severity from its crime description.
df['Crime Severity'] = df['Crm Cd Desc'].map(severity_crime)


In [None]:
# Flag rows that repeat an earlier row in full, then report how many there are.
duplicates = df.duplicated(keep='first')
num_duplicates = duplicates.sum()
print(f"Number of duplicate rows: {num_duplicates}")


In [None]:

# Replace the raw export headers with descriptive column names.
column_renames = {
    # identifiers and timing
    'DR_NO': 'Report Number',
    'Date Rptd': 'Date Reported',
    'DATE OCC': 'Date Occurred',
    'TIME OCC': 'Time Occurred',
    # area
    'AREA': 'Area Code',
    'AREA NAME': 'Area Name',
    'Rpt Dist No': 'Report District Number',
    # crime
    'Part 1-2': 'Crime Part',
    'Crm Cd': 'Crime Code',
    'Crm Cd Desc': 'Crime Description',
    'Mocodes': 'MO Codes',
    # victim
    'Vict Age': 'Victim Age',
    'Vict Sex': 'Victim Sex',
    'Vict Descent': 'Victim Descent',
    # premises and weapon
    'Premis Cd': 'Premises Code',
    'Premis Desc': 'Premises Description',
    'Weapon Used Cd': 'Weapon Used Code',
    'Weapon Desc': 'Weapon Description',
    # status
    'Status': 'Crime Status',
    'Status Desc': 'Status Description',
    'Crm Cd 1': 'Primary Crime Code',
    # location
    'LOCATION': 'Location',
    'Cross Street': 'Cross Street',
    'LAT': 'Latitude',
    'LON': 'Longitude',
}
df = df.rename(columns=column_renames)


In [None]:
# Crime level = the crime code divided by 100, rounded down.
df['Crime Level'] = np.floor(df['Crime Code'] / 100)

In [None]:
# Keep only rows with a positive victim age. `.copy()` makes the result an
# independent frame so the column assignment below does not write to a slice.
df = df[df['Victim Age'] > 0].copy()

# Bin edges are right-inclusive so the bins match their labels:
# (0, 18], (18, 35], (35, 55], (55, inf).
# The previous `right=False` put ages 18, 35 and 55 into the *next* bin
# (e.g. 18 -> 'Young Adult (19-35)'), and the upper edge of 100 turned
# ages >= 100 into NaN even though the last label reads '56+'.
age_bins = [0, 18, 35, 55, np.inf]
age_labels = ['Child (0-18)', 'Young Adult (19-35)', 'Adult (36-55)', 'Senior (56+)']
df['Age Category'] = pd.cut(df['Victim Age'], bins=age_bins, labels=age_labels, right=True)


In [None]:
# Keep only rows where the victim's sex is recorded.
df = df[df['Victim Sex'].notna()]

In [None]:
df['Victim Sex'].unique()

# Collapse every sex code occurring fewer than `threshold` times into 'Other'.
threshold = 10000
sex_counts = df['Victim Sex'].value_counts()
is_rare_sex = sex_counts < threshold
sex_to_other = sex_counts.loc[is_rare_sex].index

df['Victim Sex'] = df['Victim Sex'].replace(to_replace=sex_to_other, value='Other')


In [None]:
df['Victim Descent'].unique()

# Collapse every descent code occurring fewer than `threshold` times into 'Other'.
threshold = 10000
descent_counts = df['Victim Descent'].value_counts()
is_rare_descent = descent_counts < threshold
descent_to_other = descent_counts.loc[is_rare_descent].index

df['Victim Descent'] = df['Victim Descent'].replace(to_replace=descent_to_other, value='Other')

In [None]:
# Drop rows missing either the victim descent or the premises code.
required_columns = ['Victim Descent', 'Premises Code']
df = df.dropna(subset=required_columns, how='any')

In [None]:
# Remove rows holding an outlier in any numeric column, where an outlier is a
# value more than `threshold` population standard deviations from its column mean.
#
# The earlier version wrote the filtered numeric columns into a copy (leaving
# NaN in the outlier rows) and then called `df.update(df_cleaned)`. Because
# `DataFrame.update` only copies non-NA values, the NaNs were ignored and `df`
# came out unchanged, so no outlier was ever removed.
#
# NOTE(review): every numeric column is screened, including code/identifier-like
# ones (e.g. 'Report Number', 'Area Code') — confirm this is intended.
numerical_columns = df.select_dtypes(include=[np.number]).columns.tolist()

# Numeric-only view of the data
df_numerical = df[numerical_columns]

# Per-column mean and population (ddof=0) standard deviation, as np.std computes
threshold = 2
mean = df_numerical.mean(axis=0)
std = df_numerical.std(axis=0, ddof=0)

# Locate outlier cells. Comparisons against NaN are False, so a missing value
# is never counted as an outlier; missing data is dropped explicitly elsewhere.
is_outlier = (df_numerical - mean).abs() > threshold * std
outliers = np.where(is_outlier)

# Keep only the rows that have no outlier in any numeric column
rows_to_keep = ~is_outlier.any(axis=1)
df_cleaned_numerical = df_numerical[rows_to_keep]
df_cleaned = df.loc[rows_to_keep].copy()

# Continue the analysis with the cleaned frame
df = df_cleaned

In [None]:
# Re-check dtypes and non-null counts after the cleaning steps.
df.info()

In [None]:

# Victim age distribution for each severity level.
_, age_ax = plt.subplots(figsize=(12, 6))
sns.boxplot(data=df, x='Crime Severity', y='Victim Age', ax=age_ax)
age_ax.set_title('Crime Severity vs Victim Age')
age_ax.set_xlabel('Crime Severity')
age_ax.set_ylabel('Victim Age')
plt.show()

# Number of records per severity level, split by victim sex.
_, sex_ax = plt.subplots(figsize=(12, 6))
sns.countplot(data=df, x='Crime Severity', hue='Victim Sex', ax=sex_ax)
sex_ax.set_title('Crime Severity vs Victim Sex')
sex_ax.set_xlabel('Crime Severity')
sex_ax.set_ylabel('Count')
# Anchor the legend outside the axes, to the right of the plot.
sex_ax.legend(title='Victim Sex', bbox_to_anchor=(1.05, 1), loc='upper left')
plt.show()


In [None]:

# Records per date, in chronological order, for both date columns.
reported_freq = df['Date Reported'].value_counts().sort_index()
occurred_freq = df['Date Occurred'].value_counts().sort_index()

# Smooth with a mean over 7 consecutive entries (dates present in the data).
reported_freq_ma = reported_freq.rolling(7).mean()
occurred_freq_ma = occurred_freq.rolling(7).mean()

_, freq_ax = plt.subplots(figsize=(10, 6))
reported_freq_ma.plot(kind='line', label='Date Reported (Moving Average)', ax=freq_ax)
occurred_freq_ma.plot(kind='line', label='Date Occurred (Moving Average)', ax=freq_ax)
freq_ax.set_xlabel('Date')
freq_ax.set_ylabel('Frequency')
freq_ax.set_title('Distribution of Date Reported and Date Occurred (Moving Average)')
freq_ax.legend()
freq_ax.grid(True)
plt.show()

In [None]:

# Whole days between occurrence and report (report date minus occurrence date).
reporting_delay = df['Date Reported'] - df['Date Occurred']
df['Date Difference'] = reporting_delay.dt.days

# Histogram with one-day-wide bins covering -50 to 50 days.
_, delay_ax = plt.subplots(figsize=(10, 6))
delay_ax.hist(df['Date Difference'], bins=range(-50, 51), edgecolor='k')
delay_ax.set_xlabel('Difference in Days')
delay_ax.set_ylabel('Frequency')
delay_ax.set_title('Difference Between Date Reported and Date Occurred')
delay_ax.grid(True)
plt.show()

In [None]:
import folium
from folium.plugins import HeatMap

# Make sure both coordinates are numeric and drop rows where either is missing.
df['Latitude'] = pd.to_numeric(df['Latitude'], errors='coerce')
df['Longitude'] = pd.to_numeric(df['Longitude'], errors='coerce')
df = df.dropna(subset=['Latitude', 'Longitude'])

# Centre the map on the mean coordinate of the remaining records.
map_center = [df['Latitude'].mean(), df['Longitude'].mean()]
crime_map = folium.Map(location=map_center, zoom_start=12)

# Build the [lat, lon] pairs in one vectorised step; iterating with
# `iterrows()` creates a Series per row and is very slow on a large frame.
heat_data = df[['Latitude', 'Longitude']].to_numpy().tolist()
HeatMap(heat_data).add_to(crime_map)


In [None]:
# Horizontal bars of record counts per area, most frequent area first.
areas_by_frequency = df['Area Name'].value_counts().index
_, area_ax = plt.subplots(figsize=(12, 6))
sns.countplot(data=df, y='Area Name', order=areas_by_frequency, ax=area_ax)
area_ax.set_title('Crime Distribution by Area')
area_ax.set_xlabel('Count')
area_ax.set_ylabel('Area')
plt.show()

In [None]:

# Victim age histogram (30 bins) with a kernel density estimate overlaid.
_, age_hist_ax = plt.subplots(figsize=(12, 6))
sns.histplot(df['Victim Age'], bins=30, kde=True, ax=age_hist_ax)
age_hist_ax.set_title('Distribution of Victim Age')
age_hist_ax.set_xlabel('Victim Age')
age_hist_ax.set_ylabel('Frequency')
plt.show()

In [None]:
# Record counts per victim sex, most frequent value first.
sexes_by_frequency = df['Victim Sex'].value_counts().index
_, sex_count_ax = plt.subplots(figsize=(12, 6))
sns.countplot(data=df, x='Victim Sex', order=sexes_by_frequency, ax=sex_count_ax)
sex_count_ax.set_title('Crime Distribution by Victim Sex')
sex_count_ax.set_xlabel('Victim Sex')
sex_count_ax.set_ylabel('Count')
plt.show()

In [None]:
# Overwrite the calendar columns with month / weekday names for readable axis labels.
reported_dt = df['Date Reported'].dt
df['Month'] = reported_dt.month_name()
df['Day of Week'] = reported_dt.day_name()

# Record counts per month, most frequent month first.
months_by_frequency = df['Month'].value_counts().index
_, month_ax = plt.subplots(figsize=(12, 6))
sns.countplot(data=df, x='Month', order=months_by_frequency, ax=month_ax)
month_ax.set_title('Crime Distribution by Month')
month_ax.set_xlabel('Month')
month_ax.set_ylabel('Count')
plt.show()

In [None]:
# Record counts per day of the week, most frequent day first.
days_by_frequency = df['Day of Week'].value_counts().index
_, weekday_ax = plt.subplots(figsize=(12, 6))
sns.countplot(data=df, x='Day of Week', order=days_by_frequency, ax=weekday_ax)
weekday_ax.set_title('Crime Distribution by Day of the Week')
weekday_ax.set_xlabel('Day of the Week')
weekday_ax.set_ylabel('Count')
plt.show()

In [None]:
# Record counts per victim descent, most frequent value first.
descents_by_frequency = df['Victim Descent'].value_counts().index
_, descent_ax = plt.subplots(figsize=(12, 6))
sns.countplot(data=df, x='Victim Descent', order=descents_by_frequency, ax=descent_ax)
descent_ax.set_title('Crime Distribution by Victim Descent')
descent_ax.set_xlabel('Victim Descent')
descent_ax.set_ylabel('Count')
plt.show()

In [None]:

# Victim ages for records whose victim sex is 'M' and 'F', missing ages removed.
data_male = df.loc[df['Victim Sex'] == 'M', 'Victim Age'].dropna()
data_female = df.loc[df['Victim Sex'] == 'F', 'Victim Age'].dropna()

# Shapiro-Wilk normality test on each sample.
for age_sample in (data_male, data_female):
    print(stats.shapiro(age_sample))

# Two-sample t-test (SciPy defaults) comparing the two mean ages.
t_stat, p_value = stats.ttest_ind(data_male, data_female)

print(f"T-statistic: {t_stat}")
print(f"P-value: {p_value}")

# Report the decision at the 5% significance level.
alpha = 0.05
verdict = (
    "Απορρίπτουμε την μηδενική υπόθεση (H0). Υπάρχει στατιστικά σημαντική διαφορά μεταξύ των μέσων ηλικιών των θυμάτων."
    if p_value < alpha
    else "Δεν απορρίπτουμε την μηδενική υπόθεση (H0). Δεν υπάρχει στατιστικά σημαντική διαφορά μεταξύ των μέσων ηλικιών των θυμάτων."
)
print(verdict)

In [None]:
from statsmodels.formula.api import ols
import statsmodels.api as sm
# ANOVA of victim age across crime categories (type-2 sums of squares).
# Q(...) quotes the column names containing spaces; C(...) marks the factor as categorical.
anova_formula = 'Q("Victim Age") ~ C(Q("Crime Category"))'
model = ols(anova_formula, data=df).fit()
anova_table = sm.stats.anova_lm(model, typ=2)

print(anova_table)

In [None]:
# Switch the calendar columns back to numbers: month number and
# day of week shifted to start at 1 (pandas' weekday is 0-based, Monday first).
reported_dt = df['Date Reported'].dt
df['Month'] = reported_dt.month
df['Day of Week'] = reported_dt.dayofweek + 1

In [None]:

# Save the processed frame to 'df_new.csv' in the working directory, without the index column.
df.to_csv('df_new.csv', index=False)

In [None]:
# Preview the first rows of the processed frame.
df.head()

In [None]:

from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
import matplotlib.pyplot as plt
import matplotlib.cm as cm
# Was `import seaborn as sm`, which overwrote the `sm` alias bound to
# statsmodels.api by the ANOVA cell and broke that cell on re-run.
import seaborn as sns


# One-hot encode the two derived categorical features.
df_encoded = pd.get_dummies(df[['Crime Category', 'Crime Severity']])

# Feature matrix = the one-hot columns plus the selected numeric columns.
df_combined = pd.concat([df_encoded, df[['Crime Level','Primary Crime Code','Area Code','Premises Code','Month','Day of Week','Crime Part','Victim Age','Crime Code','Latitude','Longitude' ]]], axis=1)


# Standardise every feature before PCA.
scaler = StandardScaler()
scaled_data = scaler.fit_transform(df_combined)

# Project onto the first 7 principal components.
pca = PCA(n_components=7) 
pca_data = pca.fit_transform(scaled_data)

# NOTE: df_pca gets a fresh 0..n-1 index, so it matches df by row position only.
df_pca = pd.DataFrame(pca_data, columns=[f'PC{i+1}' for i in range(7)])

# Running total of the variance ratio explained by the components.
explained_variance = pca.explained_variance_ratio_
cumulative_explained_variance = explained_variance.cumsum()
print('Cumulative explained variance:', cumulative_explained_variance)

# Scatter of the first two components.
plt.figure(figsize=(10, 7))
plt.scatter(pca_data[:, 0], pca_data[:, 1], alpha=0.5)
plt.xlabel('Principal Component 1')
plt.ylabel('Principal Component 2')
plt.title('PCA of Data (2D)')
plt.show()



In [None]:
# Preview the first rows of the principal-component frame.
df_pca.head()

In [None]:

# Fit K-means for k = 1..9 and record each model's inertia.
k_range = range(1, 10)
inertia = []

for k in k_range:
    kmeans = KMeans(n_clusters=k, n_init=10, random_state=42).fit(df_pca)
    inertia.append(kmeans.inertia_)

# Visualise the elbow curve
_, elbow_ax = plt.subplots(figsize=(10, 7))
elbow_ax.plot(k_range, inertia, 'bo-')
elbow_ax.set_xlabel('Number of clusters (k)')
elbow_ax.set_ylabel('Inertia')
elbow_ax.set_title('Elbow Method For Optimal k')
elbow_ax.grid(True)
plt.show()


In [None]:
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt

# Number of clusters chosen for the final model.
optimal_clusters = 6

# Fit on the principal-component columns only. On a re-run df_pca already holds
# 'Cluster' (and later 'Victim Sex'), which must not be fed back into the fit.
pc_columns = [column for column in df_pca.columns if column.startswith('PC')]

# n_init is set explicitly to match the elbow search; the library default
# differs between scikit-learn releases.
kmeans = KMeans(n_clusters=optimal_clusters, n_init=10, random_state=42)
df_pca['Cluster'] = kmeans.fit_predict(df_pca[pc_columns])


# Scatter of the first two components, coloured by cluster label.
plt.figure(figsize=(10, 6))
plt.scatter(df_pca['PC1'], df_pca['PC2'], c=df_pca['Cluster'], cmap='viridis')
plt.xlabel('Principal Component 1')
plt.ylabel('Principal Component 2')
plt.title(f'K-means Clustering with {optimal_clusters} Clusters')
plt.show()


In [None]:
# These estimators were used below without ever being imported (NameError).
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB

# Attach the victim sex by row position (df_pca has its own 0..n-1 index).
df_pca['Victim Sex'] = df['Victim Sex'].values


X = df_pca.drop(columns=['Victim Sex'])
y_cluster = df_pca['Cluster']
y_sex = df_pca['Victim Sex']

# The cluster task must not see the 'Cluster' column among its features,
# otherwise the classifiers are simply handed the label they have to predict.
X_cluster_features = X.drop(columns=['Cluster'])
X_train_cluster, X_test_cluster, y_train_cluster, y_test_cluster = train_test_split(X_cluster_features, y_cluster, test_size=0.3, random_state=42)

# The victim-sex task keeps the original feature set (components plus cluster label).
X_train_sex, X_test_sex, y_train_sex, y_test_sex = train_test_split(X, y_sex, test_size=0.3, random_state=42)

classifiers = {
    "Random Forest": RandomForestClassifier(random_state=42, max_depth=10, n_estimators=200),
    "SVM": SVC(random_state=42),
    "K-Nearest Neighbors": KNeighborsClassifier(),
    "Logistic Regression": LogisticRegression(random_state=42),
    "Decision Tree": DecisionTreeClassifier(random_state=42),
    "Naive Bayes": GaussianNB()
}

# Train/test splits per prediction target, evaluated with the same set of models.
evaluation_tasks = {
    "Cluster": (X_train_cluster, X_test_cluster, y_train_cluster, y_test_cluster),
    "Victim Sex": (X_train_sex, X_test_sex, y_train_sex, y_test_sex),
}

for target_name, (X_train, X_test, y_train, y_test) in evaluation_tasks.items():
    print(f"Classification for {target_name}")
    print("="*60)
    for name, clf in classifiers.items():
        # fit() retrains the estimator from scratch, so reusing one instance
        # across both targets is safe.
        clf.fit(X_train, y_train)
        predictions = clf.predict(X_test)
        print(f"Classification report for {target_name} with {name}:")
        print(classification_report(y_test, predictions))
        print("\n" + "="*60 + "\n")
