<a href="https://colab.research.google.com/github/elenaajayi/Computer-Vision-Fundamentals---CUS-754/blob/main/solution_nlp_ELENA_AJAYI.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Module 1: Data Loading and Initial Exploration

In [None]:
# Import all necessary libraries
import logging
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re
import pickle
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

# Configure root logging so every message carries a timestamp and its level
logging.basicConfig(level=logging.INFO, format='%(asctime)s %(levelname)s: %(message)s')

# Download the NLTK resources used later by word_tokenize and stopwords.words
nltk.download('punkt')
# Recent NLTK releases load the tokenizer tables from 'punkt_tab' instead of
# 'punkt'; fetch both so word_tokenize works on either. On older releases
# this extra request is harmless.
nltk.download('punkt_tab')
nltk.download('stopwords')

logging.info("Starting Module 1: Data Ingestion & Initial Exploration")

def validate_columns(df, required_columns, df_name="DataFrame"):
    """
    Check that every name in ``required_columns`` is a column of ``df``.

    Logs an INFO message when nothing is missing; otherwise raises a
    ValueError that lists the absent columns. ``df_name`` is only used to
    label the log and error messages.
    """
    missing = []
    for col in required_columns:
        if col not in df.columns:
            missing.append(col)

    if not missing:
        logging.info(f"{df_name} contains all required columns: {required_columns}")
        return

    raise ValueError(f"{df_name} is missing required columns: {missing}")

def _load_csv(path, required_columns, label, success_message):
    """
    Read a CSV file into a DataFrame, validate its columns and log a preview.

    Parameters:
        path: CSV file to read.
        required_columns: column names that must be present.
        label: human-readable dataset name used in validation messages.
        success_message: text logged once the file is read and validated.

    Returns:
        The loaded DataFrame.

    Raises:
        Any exception from reading or validating is logged and re-raised.
    """
    try:
        frame = pd.read_csv(path)
        validate_columns(frame, required_columns, label)
        logging.info(success_message)
        logging.info(frame.head())
    except Exception as e:
        logging.error("Error loading %s: %s", path, e)
        raise
    return frame


def _load_pickle(path, success_message):
    """
    Unpickle the object stored at ``path`` and log it.

    Returns:
        The unpickled object.

    Raises:
        Any exception from opening or unpickling is logged and re-raised.
    """
    try:
        with open(path, 'rb') as f:
            # SECURITY: pickle.load can execute arbitrary code embedded in the
            # file. Only load taxonomy pickles that come from a trusted source.
            loaded = pickle.load(f)
        logging.info(success_message)
        logging.info(loaded)
    except Exception as e:
        logging.error("Error loading %s: %s", path, e)
        raise
    return loaded


# Load Risk Factors dataset
risk_factors = _load_csv(
    'risk_factors.csv',
    ['english_keywords', 'keywords_arabic'],
    'Risk Factors',
    "Risk Factors Data loaded successfully",
)

# Load Thematic Mapping dataset
thematic_mapping = _load_csv(
    'thematic_mapping.csv',
    ['risk_factor', 'cluster'],
    'Thematic Mapping',
    "Thematic Mapping loaded successfully",
)

# Load English news articles
news_articles_eng = _load_csv(
    'news-articles-eng.csv',
    ['content', 'date', 'location_key'],
    'English News Articles',
    "English News Articles loaded successfully",
)

# Load Arabic news articles
news_articles_ara = _load_csv(
    'news-articles-ara.csv',
    ['content', 'date', 'location_key'],
    'Arabic News Articles',
    "Arabic News Articles loaded successfully",
)

# Load Geographic Taxonomy files (assumed to be in pickle format)
geo_english = _load_pickle(
    'id_english_location_name.pkl',
    "Geographic Taxonomy (English) loaded successfully",
)
geo_arabic = _load_pickle(
    'id_arabic_location_name.pkl',
    "Geographic Taxonomy (Arabic) loaded successfully",
)



[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
ERROR:root:Error loading risk_factors.csv: [Errno 2] No such file or directory: 'risk_factors.csv'


FileNotFoundError: [Errno 2] No such file or directory: 'risk_factors.csv'

Module 2: Data Cleaning & Preprocessing

In [None]:

logging.info("Starting Module 2: Data Cleaning & Preprocessing")

# Sample Arabic stopword list; used for every language other than 'english'.
_ARABIC_STOPWORDS = frozenset({
    "في", "على", "من", "ما", "مع", "لا", "إلى", "عن", "أن", "هذا",
    "و", "إلا", "لكن", "ذلك", "هذه", "هو", "هي", "هناك", "أو",
    "إما", "لم", "لن", "قد", "بعد", "كما",
})

# Stop-word sets that have already been built, keyed by 'english' / 'arabic'.
_STOP_WORD_CACHE = {}


def _stop_words_for(language):
    """Return the stop-word set for ``language``, building it at most once."""
    key = 'english' if language == 'english' else 'arabic'
    if key not in _STOP_WORD_CACHE:
        if key == 'english':
            # Reading the corpus list is the expensive part; do it only once
            # instead of on every call to clean_text.
            _STOP_WORD_CACHE[key] = frozenset(stopwords.words('english'))
        else:
            _STOP_WORD_CACHE[key] = _ARABIC_STOPWORDS
    return _STOP_WORD_CACHE[key]


def clean_text(text, language='english'):
    """
    Clean text by converting to lowercase, removing punctuation,
    tokenizing, and removing stopwords.

    Parameters:
        text: value to clean; non-string values are converted with ``str``.
        language: 'english' selects the NLTK English stopwords; any other
            value selects the sample Arabic stopword list.

    Returns:
        The remaining tokens joined by single spaces. Missing values
        (``None`` or a float NaN) yield an empty string.
    """
    # Missing cells must not be stringified into the junk token "none"/"nan".
    if text is None or (isinstance(text, float) and text != text):
        return ""

    text = str(text).lower()
    # Drop every character that is neither a word character nor whitespace.
    text = re.sub(r'[^\w\s]', '', text)
    tokens = word_tokenize(text)

    stop_words = _stop_words_for(language)
    return " ".join(word for word in tokens if word not in stop_words)

# Derive a cleaned copy of the English keywords (keyword arguments given to
# Series.apply are forwarded to clean_text)
risk_factors['clean_english_keywords'] = (
    risk_factors['english_keywords'].apply(clean_text, language='english')
)
logging.info("Cleaned Risk Factors (English Keywords):")
logging.info(
    risk_factors[['english_keywords', 'clean_english_keywords']].head()
)

# Improved translation: use googletrans if available; else, use dummy translation.
try:
    from googletrans import Translator
except ImportError:
    logging.warning("googletrans library not found. Falling back to dummy translation.")
    translator = None
else:
    translator = Translator()


def translate_to_arabic(text):
    """
    Translate English text to Arabic.

    Uses googletrans when it could be imported. If the library is missing,
    or the translation call fails, the dummy translation (the input with an
    '_ara' suffix) is returned instead.

    Parameters:
        text: the English text to translate.

    Returns:
        The Arabic translation, or the dummy translation on fallback.
        Non-string values (e.g. a NaN cell) are returned unchanged, since
        there is nothing to translate and appending the suffix would raise
        a TypeError.
    """
    if not isinstance(text, str):
        return text
    if translator is None:
        return text + '_ara'
    try:
        # NOTE(review): newer googletrans releases appear to make translate()
        # a coroutine; if so, `.text` fails and the fallback below is taken.
        # Confirm against the installed version.
        translated = translator.translate(text, dest='ar')
        return translated.text
    except Exception as e:
        logging.error("Translation error for text '%s': %s", text, e)
        return text + '_ara'

# Keep an existing Arabic keyword; translate the English keyword only when
# the Arabic cell is missing or contains nothing but whitespace
risk_factors['keywords_arabic'] = risk_factors.apply(
    lambda record: record['keywords_arabic']
                   if not (pd.isna(record['keywords_arabic'])
                           or record['keywords_arabic'].strip() == '')
                   else translate_to_arabic(record['english_keywords']),
    axis=1,
)
logging.info("Risk Factors with Arabic Keywords (After Translation):")
logging.info(
    risk_factors[['english_keywords', 'keywords_arabic']].head()
)

def _add_clean_content(articles, language, heading):
    """
    Add a 'clean_content' column to ``articles`` (in place) by running
    clean_text over its 'content' column, then log ``heading`` and a preview.
    Does nothing when the frame has no 'content' column.
    """
    if 'content' not in articles.columns:
        return
    articles['clean_content'] = articles['content'].apply(clean_text, language=language)
    logging.info(heading)
    logging.info(articles[['content', 'clean_content']].head())


# Preprocess news articles content
_add_clean_content(
    news_articles_eng, 'english',
    "Sample Cleaned Content for English News Articles:",
)
_add_clean_content(
    news_articles_ara, 'arabic',
    "Sample Cleaned Content for Arabic News Articles:",
)

def map_location(key, geo_dict):
    """
    Look up ``key`` in ``geo_dict`` and return the stored location name,
    or the string 'Unknown' when the key is not present.
    """
    fallback = 'Unknown'
    location_name = geo_dict.get(key, fallback)
    return location_name

# Add a 'location_name' column to each article set by resolving every
# 'location_key' through the matching geographic taxonomy; keys absent
# from the taxonomy become 'Unknown' (see map_location).
if 'location_key' in news_articles_eng.columns:
    news_articles_eng['location_name'] = news_articles_eng['location_key'].apply(lambda x: map_location(x, geo_english))
    logging.info("English News Articles with Mapped Location Names:")
    logging.info(news_articles_eng[['location_key', 'location_name']].head())

if 'location_key' in news_articles_ara.columns:
    news_articles_ara['location_name'] = news_articles_ara['location_key'].apply(lambda x: map_location(x, geo_arabic))
    logging.info("Arabic News Articles with Mapped Location Names:")
    # Stray text that trailed this statement was a syntax error and is removed.
    logging.info(news_articles_ara[['location_key', 'location_name']].head())
