In [4]:
import pandas as pd
import os

In [11]:


# Load the raw occurrence CSV and print a quick structural overview.
#
# The file is located relative to the notebook's working directory
# (<cwd>/../data/raw/), mirroring how the save cell derives <cwd>/../data/cleaned/,
# so the notebook is not tied to one machine's absolute drive path.
data_path = os.path.abspath(
    os.path.join(os.getcwd(), "..", "data", "raw", "Bison_bison.csv")
)

# Fail early with a message that names the working directory, since a wrong
# cwd is the most likely reason for the relative path not to resolve.
if not os.path.isfile(data_path):
    raise FileNotFoundError(
        f"Raw data file not found: {data_path} (working directory: {os.getcwd()})"
    )

# low_memory=False makes pandas read the file in one pass instead of chunked
# type inference, which avoids mixed-dtype columns on wide files.
# NOTE(review): encoding='latin1' is kept from the original — confirm the
# export really is Latin-1 and not UTF-8.
df = pd.read_csv(data_path, encoding='latin1', low_memory=False)

# Show structure and preview
print("Shape:", df.shape)
print("\nColumns:\n", df.columns.tolist())

# The ten columns with the most missing values
print("\nMissing values:\n", df.isnull().sum().sort_values(ascending=False).head(10))

# Preview first few rows (rendered as the cell's rich output)
df.head(5)



Shape: (25092, 223)

Columns:
 ['gbifID', 'accessRights', 'bibliographicCitation', 'language', 'license', 'modified', 'publisher', 'references', 'rightsHolder', 'type', 'institutionID', 'collectionID', 'datasetID', 'institutionCode', 'collectionCode', 'datasetName', 'ownerInstitutionCode', 'basisOfRecord', 'informationWithheld', 'dataGeneralizations', 'dynamicProperties', 'occurrenceID', 'catalogNumber', 'recordNumber', 'recordedBy', 'recordedByID', 'individualCount', 'organismQuantity', 'organismQuantityType', 'sex', 'lifeStage', 'reproductiveCondition', 'caste', 'behavior', 'vitality', 'establishmentMeans', 'degreeOfEstablishment', 'pathway', 'georeferenceVerificationStatus', 'occurrenceStatus', 'preparations', 'disposition', 'associatedOccurrences', 'associatedReferences', 'associatedSequences', 'associatedTaxa', 'otherCatalogNumbers', 'occurrenceRemarks', 'organismID', 'organismName', 'organismScope', 'associatedOrganisms', 'previousIdentifications', 'organismRemarks', 'materialEnt

Unnamed: 0,gbifID,accessRights,bibliographicCitation,language,license,modified,publisher,references,rightsHolder,type,...,publishedByGbifRegion,level0Gid,level0Name,level1Gid,level1Name,level2Gid,level2Name,level3Gid,level3Name,iucnRedListCategory
0,2248494654,,,,CC_BY_4_0,,The International Barcode of Life Consortium,https://boldsystems.org/index.php/Public_Recor...,,,...,,CAN,Canada,CAN.2_1,British Columbia,CAN.2.22_1,Peace River,CAN.2.22.11_1,Peace River B,
1,2249805606,,,,CC_BY_4_0,,The International Barcode of Life Consortium,https://boldsystems.org/index.php/Public_Recor...,,,...,,CAN,Canada,CAN.1_1,Alberta,CAN.1.2_1,Division No. 11,CAN.1.2.12_1,Edmonton,
2,2249805916,,,,CC_BY_4_0,,The International Barcode of Life Consortium,https://boldsystems.org/index.php/Public_Recor...,,,...,,RUS,Russia,RUS.35_1,Krasnoyarsk,RUS.35.30_1,Khatangskiy rayon,RUS.35.30.1_1,,
3,2249805950,,,,CC_BY_4_0,,The International Barcode of Life Consortium,https://boldsystems.org/index.php/Public_Recor...,,,...,,CAN,Canada,CAN.13_1,Yukon,CAN.13.1_1,Yukon,CAN.13.1.35_1,"Yukon, Unorganized",
4,2249805693,,,,CC_BY_4_0,,The International Barcode of Life Consortium,https://boldsystems.org/index.php/Public_Recor...,,,...,,USA,United States,USA.17_1,Kansas,USA.17.99_1,Wabaunsee,,,


In [23]:
# Cleaning the data
#
# Input : `df`, the raw occurrence table loaded by the previous cell.
# Output: `df` rebound to a NEW frame that holds only `keep_columns`, limited to
#         rows that have coordinates and a year, ordered by year then month,
#         with a fresh 0..n-1 index. The raw frame object is not mutated.
# Raises: KeyError if any required column is absent from the input.

# Columns written to the cleaned file, in output order.
keep_columns = ['year', 'month', 'day', 'decimalLatitude', 'decimalLongitude', 'individualCount']

# Rows lacking any of these are dropped.
required_columns = ['decimalLatitude', 'decimalLongitude', 'year']

missing_required = [col for col in required_columns if col not in df.columns]
if missing_required:
    raise KeyError(f"Required columns missing from input: {missing_required}")

# Columns are selected by name instead of first pruning empty / single-valued
# columns across the whole frame: that pruning could silently remove a kept
# column that happens to be constant (e.g. all records from one year), which
# then broke the steps below or changed the output schema between datasets.
present_columns = [col for col in keep_columns if col in df.columns]
sort_keys = [col for col in ['year', 'month'] if col in present_columns]

df = (
    df[present_columns]
    .dropna(subset=required_columns)          # essential spatial and temporal data
    .sort_values(by=sort_keys, ascending=True)
    .reset_index(drop=True)
)

# Optional (disabled): parse 'eventDate' as datetime. It would have to be added
# to keep_columns first, otherwise the column is no longer present here.
# df['eventDate'] = pd.to_datetime(df['eventDate'], errors='coerce')

# Optional (disabled): keep only rows with a positive individualCount.
# Note that this also removes rows where individualCount is missing,
# because NaN > 0 evaluates to False.
# if 'individualCount' in df.columns:
#     df = df[df['individualCount'] > 0]

# Show preview
print("Cleaned shape:", df.shape)
print("\nColumns:\n", df.columns.tolist())
df.head(5)


Cleaned shape: (19501, 6)

Columns:
 ['year', 'month', 'day', 'decimalLatitude', 'decimalLongitude', 'individualCount']


Unnamed: 0,year,month,day,decimalLatitude,decimalLongitude,individualCount
0,1856.0,1.0,1.0,38.82081,-96.33162,
1,1868.0,12.0,,41.30518,-105.58424,1.0
2,1868.0,12.0,,41.30518,-105.58424,1.0
3,1868.0,12.0,,41.30518,-105.58424,1.0
4,1868.0,12.0,,41.30518,-105.58424,1.0


In [24]:
# Write the cleaned frame to <cwd>/../data/cleaned/Bison_bison_cleaned.csv.

# Resolve the output folder one level above the working directory.
parent_dir = os.path.abspath(os.path.join(os.getcwd(), ".."))
cleaned_dir = os.path.join(parent_dir, "data", "cleaned")

# Target file inside that folder.
cleaned_path = os.path.join(cleaned_dir, "Bison_bison_cleaned.csv")

# Make sure the folder is there before writing; no error if it already exists.
os.makedirs(cleaned_dir, exist_ok=True)

# Persist without the positional index, encoded as UTF-8.
df.to_csv(path_or_buf=cleaned_path, encoding='utf-8', index=False)

print(f" Cleaned CSV saved to:\n{cleaned_path}")


 Cleaned CSV saved to:
E:\file_main\Major_1\fauna-forecast\data\cleaned\Bison_bison_cleaned.csv
