In [10]:
import warnings
warnings.filterwarnings("ignore")
import pandas as pd
import chardet

In [11]:
data = pd.read_csv("globalterrorismdb_0718dist.csv",encoding='Windows-1252')

In [24]:
import dask.dataframe as dd
import time

# Function to clean data using Pandas
def clean_data_pandas(data):

    # Drop columns with more than 100000 null values
    columns = data.columns
    for col in columns:
        count_null = data[col].isnull().sum()
        if count_null >= 100000:
            data = data.drop(col, axis=1)

    # Check for duplicates and drop unnecessary columns
    data = data.drop_duplicates()
    necessary_columns = ['iyear', 'imonth', 'iday', 'extended', 'country_txt', 'region_txt', 'city', 'success', 'suicide', 'attacktype1_txt', 'targtype1_txt', 'target1', 'natlty1_txt', 'gname', 'individual', 'weaptype1_txt', 'nkill', 'property']
    data = data[necessary_columns]
    
     # Convert data types
        
    data['nkill'] = data['nkill'].fillna(0).astype(int)
    data['nwound'] = data['nwound'].fillna(0).astype(int)
    data['latitude'] = data['latitude'].astype(str)
    data['longitude'] = data['longitude'].astype(str)

    # Handle missing data
    data = data.dropna(subset=['country_txt', 'region_txt', 'attacktype1_txt', 'targtype1_txt'])
    data['latitude'] = data['latitude'].fillna(data['latitude'].mode())
    data['longitude'] = data['longitude'].fillna(data['longitude'].mode())
    data['nwound'] = data['nwound'].fillna(data['nwound'].mean())
    data['target1'] = data['target1'].fillna(data['target1'].mode())

   

    # Replace 0 values in 'imonth' and 'iday'
    data['imonth'] = data['imonth'].replace(0, 5)
    data['iday'] = data['iday'].replace(0, 15)

    # Convert text columns to consistent format
    columns_text = ['country_txt', 'region_txt', 'attacktype1_txt', 'targtype1_txt', 'target1', 'natlty1_txt', 'gname']
    for col in columns_text:
        data[col] = data[col].str.lower().str.strip()

    # Rename columns
    data = data.rename(columns={'iyear': 'Year', 'imonth': 'Month', 'iday': 'Day', 'extended': 'Extended', 'country_txt': 'Country', 'region_txt': 'Region', 'city': 'City', 'success': 'Success', 'suicide': 'Suicide', 'attacktype1_txt': 'AttackType', 'targtype1_txt': 'TargetType', 'target1': 'Target', 'natlty1_txt': 'Nationality', 'gname': 'Group', 'individual': 'Individual', 'weaptype1_txt': 'WeaponType', 'nkill': 'Killed', 'nwound': 'Wounded', 'property': 'Property'})

    return data


# Measure time for Pandas
start_time = time.time()
cleaned_data_pandas = clean_data_pandas(data)
end_time = time.time()
print(f"Pandas took {end_time - start_time:.2f} seconds")


cleaned_data_pandas.to_csv("cleaned_data_pandas.csv", index=False)


Pandas took 17.45 seconds


In [32]:
data = dd.read_csv("globalterrorismdb_0718dist.csv",encoding='Windows-1252',dtype={'approxdate': 'object',
       'attacktype2_txt': 'object',
       'attacktype3_txt': 'object',
       'claimmode2_txt': 'object',
       'claimmode3_txt': 'object',
       'corp2': 'object',
       'corp3': 'object',
       'divert': 'object',
       'doubtterr': 'float64',
       'gname2': 'object',
       'gname3': 'object',
       'gsubname': 'object',
       'gsubname2': 'object',
       'gsubname3': 'object',
       'hostkidoutcome_txt': 'object',
       'multiple': 'float64',
       'natlty1': 'float64',
       'natlty2_txt': 'object',
       'natlty3_txt': 'object',
       'ransom': 'float64',
       'ransomnote': 'object',
       'related': 'object',
       'target2': 'object',
       'target3': 'object',
       'targsubtype1': 'float64',
       'targsubtype2_txt': 'object',
       'targsubtype3_txt': 'object',
       'targtype2_txt': 'object',
       'targtype3_txt': 'object',
       'weapsubtype2_txt': 'object',
       'weapsubtype3_txt': 'object',
       'weaptype2_txt': 'object',
       'weaptype3_txt': 'object',
       'guncertain1': 'float64',
       'ishostkid': 'float64',
       'resolution': 'object',
       'specificity': 'float64',
       'weapsubtype4_txt': 'object',
       'weaptype4_txt': 'object'
                                                                                  })


def clean_data_dask(data):


    # Drop columns with more than 100000 null values
    columns = data.columns
    for col in columns:
        count_null = data[col].isnull().sum().compute()
        if count_null >= 100000:
            data = data.drop(col, axis=1)

    # Check for duplicates and drop unnecessary columns
    data = data.drop_duplicates()
    necessary_columns = ['iyear', 'imonth', 'iday', 'extended', 'country_txt', 'region_txt', 'city', 'success', 'suicide', 'attacktype1_txt', 'targtype1_txt', 'target1', 'natlty1_txt', 'gname', 'individual', 'weaptype1_txt', 'nkill', 'property']
    data = data[necessary_columns]

    # Handle missing data
    data = data.dropna(subset=['country_txt', 'region_txt', 'attacktype1_txt', 'targtype1_txt'])
    data['latitude'] = data['latitude'].fillna(data['latitude'].mode().compute()[0])
    data['longitude'] = data['longitude'].fillna(data['longitude'].mode().compute()[0])
    data['nwound'] = data['nwound'].fillna(data['nwound'].mean().compute())
    data['target1'] = data['target1'].fillna(data['target1'].mode().compute()[0])

    # Convert data types
    data['nkill'] = data['nkill'].fillna(0).astype(int)
    data['nwound'] = data['nwound'].fillna(0).astype(int)
    data['latitude'] = data['latitude'].astype(str)
    data['longitude'] = data['longitude'].astype(str)

    # Replace 0 values in 'imonth' and 'iday'
    data['imonth'] = data['imonth'].replace(0, 5)
    data['iday'] = data['iday'].replace(0, 15)

    # Convert text columns to consistent format
    columns_text = ['country_txt', 'region_txt', 'attacktype1_txt', 'targtype1_txt', 'target1', 'natlty1_txt', 'gname']
    for col in columns_text:
        data[col] = data[col].str.lower().str.strip()

    # Rename columns
    data = data.rename(columns={'iyear': 'Year', 'imonth': 'Month', 'iday': 'Day', 'extended': 'Extended', 'country_txt': 'Country', 'region_txt': 'Region', 'city': 'City', 'success': 'Success', 'suicide': 'Suicide', 'attacktype1_txt': 'AttackType', 'targtype1_txt': 'TargetType', 'target1': 'Target', 'natlty1_txt': 'Nationality', 'gname': 'Group', 'individual': 'Individual', 'weaptype1_txt': 'WeaponType', 'nkill': 'Killed', 'nwound': 'Wounded', 'property': 'Property'})

    return data

# Measure time for Dask
start_time = time.time()
cleaned_data_dask = clean_data_dask(data)
end_time = time.time()
print(f"Dask took {end_time - start_time:.2f} seconds")


Dask took 1466.96 seconds
