<a href="https://colab.research.google.com/github/emuduko/0760902445/blob/main/diffused_knowledge_graphs.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np

In [None]:
# Colab-only step: `files.upload()` comes from the google.colab package, so this
# cell cannot run outside a Colab runtime.
# The next cell reads "uganda_farming_dataset.csv" from the working directory,
# so presumably that is the file expected to be uploaded here — TODO confirm.
from google.colab import files
# Keep the return value of the upload call in `uploaded` for later inspection.
uploaded = files.upload()

In [None]:
# Load the raw dataset from the working directory into `df`; the later cells
# clean, encode and scale this same frame.
df = pd.read_csv("uganda_farming_dataset.csv")
# Show the first rows as a quick sanity check of the load.
df.head()

In [None]:
!pip install pandas numpy matplotlib scikit-learn networkx torch torch-geometric


In [None]:
# Remove exact duplicate rows (first occurrence is kept).
# Rebinding `df` instead of using `inplace=True` follows the pandas recommendation
# and keeps the step chainable; the surviving rows keep their original index labels.
df = df.drop_duplicates()

In [None]:
# Trim whitespace and standardize text case of the categorical columns.
# Missing values must stay missing: `astype(str)` alone would turn NaN into the
# literal string "nan" (then "Nan" after title-casing), which would hide it from
# the imputation step that follows and create a bogus "Nan" category.
for col in ['Region', 'Crop_Type', 'Recommended_Fertilizer']:
    cleaned = df[col].astype(str).str.strip().str.title()
    # Keep the cleaned text only where the original value was present.
    df[col] = cleaned.where(df[col].notna())

In [None]:
# Fill missing values: numeric columns get their mean, everything else gets its
# most frequent value.
for col in df.columns:
    column = df[col]
    if not column.isna().any():
        # Nothing to fill; also avoids needless dtype changes on complete columns.
        continue

    # Accept every numeric width (int32, float32, ...), not just float64/int64.
    # Booleans are excluded so they are treated as categories, not averaged.
    is_numeric = (
        pd.api.types.is_numeric_dtype(column)
        and not pd.api.types.is_bool_dtype(column)
    )
    if is_numeric:
        df[col] = column.fillna(column.mean())
    else:
        modes = column.mode(dropna=True)
        # An all-missing column has no mode; leave it untouched instead of
        # failing on `mode()[0]`.
        if not modes.empty:
            df[col] = column.fillna(modes.iloc[0])


In [None]:
# Detect and handle outliers (z-score filtering on the numeric columns).
from scipy import stats

# Rows with any numeric value at or beyond this many standard deviations are dropped.
Z_THRESHOLD = 3

numeric_cols = df.select_dtypes(include=[np.number]).columns
# Convert to a plain float array so the comparison below behaves the same
# regardless of the container type scipy returns.
z_scores = np.abs(np.asarray(stats.zscore(df[numeric_cols]), dtype=float))

# A constant column has zero variance, so its z-scores are NaN. `NaN < 3` is
# False, which would otherwise discard every row; an undefined z-score is
# treated as "not an outlier" instead.
within_limits = (z_scores < Z_THRESHOLD) | np.isnan(z_scores)

# `.copy()` makes the filtered frame independent of the original, so the column
# assignments in later cells operate on a real frame rather than on a slice.
df = df[within_limits.all(axis=1)].copy()

print("Data Cleaning & Harmonization Done")
print(df.head())

In [None]:
# FEATURE PREPARATION FOR KNOWLEDGE GRAPH

# Create integer IDs for the categorical entities (codes follow sorted category order).
df['Region_ID'] = df['Region'].astype('category').cat.codes
df['Crop_ID'] = df['Crop_Type'].astype('category').cat.codes
df['Fert_ID'] = df['Recommended_Fertilizer'].astype('category').cat.codes

# Feature groups describing each entity type.
soil_features = ['Soil_pH','Nitrogen','Phosphorus','Potassium']
env_features = ['Rainfall','Temperature','Humidity']
crop_features = ['Leaf_Chlorophyll','Pest_Infestation']

# Node tables: one row per distinct region / crop / fertilizer, and one soil and
# one environment record per observation.
regions = df[['Region_ID','Region']].drop_duplicates()
crops = df[['Crop_ID', 'Crop_Type']].drop_duplicates()
fertilizers = df[['Fert_ID', 'Recommended_Fertilizer']].drop_duplicates()
soils = df[soil_features]
envs = df[env_features]

# Generate relations (edges): five edges per observation, in row order.
# Soil_/Env_ nodes are numbered by row *position* (0..n-1), not by index label:
# duplicate removal and outlier filtering leave gaps in the index, whereas the
# graph-construction cell reloads the data from a CSV written with index=False
# and therefore numbers the same nodes positionally.
edges = []
for row_id, (region_id, crop_id, fert_id) in enumerate(
    zip(df['Region_ID'], df['Crop_ID'], df['Fert_ID'])
):
    region_node = f"Region_{region_id}"
    crop_node = f"Crop_{crop_id}"
    fert_node = f"Fert_{fert_id}"
    soil_node = f"Soil_{row_id}"
    env_node = f"Env_{row_id}"
    edges += [
        (region_node, soil_node, 'has'),
        (region_node, env_node, 'experiences'),
        (soil_node, crop_node, 'affects'),
        (env_node, crop_node, 'influences'),
        (crop_node, fert_node, 'requires')
    ]

edges_df = pd.DataFrame(edges, columns=['Source', 'Target', 'Relation'])

print("Entity and Relationship Tables Ready")
print("Entities: Region, Crop, Fertilizer, Soil, Environment")
print(edges_df.head())

In [None]:
# NUMERICAL SCALING & ENCODING

from sklearn.preprocessing import LabelEncoder, MinMaxScaler
import joblib

categorical_cols = ['Region', 'Crop_Type', 'Recommended_Fertilizer']
id_cols = ['Region_ID', 'Crop_ID', 'Fert_ID']

# Encode categorical features as integer codes; keep each fitted encoder so the
# codes can be mapped back to their labels later.
label_encoders = {}
for col in categorical_cols:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le

# Normalize the continuous numeric features only.
# The encoded categorical columns and the *_ID columns hold integer identifiers:
# min-max scaling would squash them into [0, 1], and the graph-construction cell
# converts them with int(), which would collapse every category to 0 or 1.
code_cols = set(categorical_cols) | set(id_cols)
scaler = MinMaxScaler()
num_cols = [
    col for col in df.select_dtypes(include=[np.number]).columns
    if col not in code_cols
]
df[num_cols] = scaler.fit_transform(df[num_cols])

# Save encoders and scaler (for reuse during deployment).
joblib.dump(label_encoders, 'label_encoders.pkl')
joblib.dump(scaler, 'scaler.pkl')

# Export the preprocessed dataset; the index is not written, so rows are
# identified by position when the file is read back.
df.to_csv('encoded_df.csv', index=False)

print("Encoding & Scaling Completed. File saved as encoded_df.csv")
df.head()


In [None]:
# STEP 1. GRAPH CONSTRUCTION (NetworkX)

import networkx as nx
import matplotlib.pyplot as plt
import pandas as pd

# Load encoded data (rows are indexed 0..n-1 after the CSV round-trip).
df = pd.read_csv("encoded_df.csv")

# Build an undirected Knowledge Graph
G = nx.Graph()

# Recover integer category codes for the entity columns.
# These columns may have been min-max scaled into [0, 1] by the preprocessing
# cell, in which case int() truncation would merge every category into code 0
# or 1. Factorizing the sorted unique values yields codes 0..k-1 in the same
# order as the original label encoding, whether or not the column was scaled.
region_codes, crop_codes, fert_codes = (
    pd.factorize(df[col], sort=True)[0]
    for col in ('Region', 'Crop_Type', 'Recommended_Fertilizer')
)

# Add one Soil and one Environment node per observation, linked to the shared
# Region / Crop / Fertilizer nodes.
for (row_id, row), region_code, crop_code, fert_code in zip(
    df.iterrows(), region_codes, crop_codes, fert_codes
):
    # Create node names
    region = f"Region_{region_code}"
    crop = f"Crop_{crop_code}"
    fert = f"Fert_{fert_code}"
    soil = f"Soil_{row_id}"
    env = f"Env_{row_id}"

    # Add nodes with features (re-adding an existing shared node is harmless)
    G.add_node(region, type="Region")
    G.add_node(crop, type="Crop")
    G.add_node(fert, type="Fertilizer")
    G.add_node(soil, type="Soil", pH=row['Soil_pH'], N=row['Nitrogen'], P=row['Phosphorus'], K=row['Potassium'])
    G.add_node(env, type="Environment", Rainfall=row['Rainfall'], Temperature=row['Temperature'], Humidity=row['Humidity'])

    # Add relationships (edges)
    G.add_edges_from([
        (region, soil, {'relation': 'has'}),
        (region, env, {'relation': 'experiences'}),
        (soil, crop, {'relation': 'affects'}),
        (env, crop, {'relation': 'influences'}),
        (crop, fert, {'relation': 'requires'})
    ])

print(f"Graph built with {len(G.nodes())} nodes and {len(G.edges())} edges.")

# Visualize small subgraph (optional): the first 40 nodes in insertion order.
subset = list(G.nodes)[:40]
H = G.subgraph(subset)
plt.figure(figsize=(12, 8))
nx.draw(H, with_labels=True, node_size=500, font_size=7)
plt.show()


In [None]:
!pip install node2vec
