<a href="https://colab.research.google.com/github/darvesh-sd/Copy-of-TPSessions.ipynb/blob/main/TP_4_No_1_Projet_Gutenberg.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**Install Required Libraries**

In [50]:
# Install required libraries
!pip install pandas wordcloud matplotlib nltk



**Import Libraries and Set Up NLTK**

In [51]:
# Import essential libraries
import pandas as pd
import zipfile
import os
import re
from wordcloud import WordCloud
import matplotlib.pyplot as plt
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import nltk

# Download necessary NLTK resources.
# Newer NLTK releases load the 'punkt_tab' data in word_tokenize(); 'punkt' is
# still fetched so the notebook also works with older releases.
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

**Upload the Facebook ZIP File**

In [52]:
# Upload your Facebook ZIP file
# `uploaded` is keyed by the name of each uploaded file; the extraction cell
# below reads its first key to find the archive to open.
from google.colab import files
uploaded = files.upload()  # A file upload dialog will appear


Saving meta-2024-Dec-05-12-44-01-20241205T094019Z-001.zip to meta-2024-Dec-05-12-44-01-20241205T094019Z-001.zip


**Extract the ZIP File**

In [54]:
# Extract the ZIP file into a folder named 'facebook_data'
if not uploaded:
    # Fail with a readable message instead of an IndexError on an empty upload.
    raise RuntimeError("No file was uploaded; run the upload cell again.")
zip_file_name = next(iter(uploaded))  # Name of the first uploaded file
with zipfile.ZipFile(zip_file_name, 'r') as zip_ref:
    zip_ref.extractall("facebook_data")  # Extract contents to 'facebook_data' folder
print("ZIP file extracted successfully!")

# Walk the extracted tree and collect every CSV file.
# The loop variables are deliberately NOT called `files`: that name is the
# google.colab module imported in the upload cell and must not be overwritten.
csv_files = []
other_extensions = set()
for root, _dirs, filenames in os.walk("facebook_data"):
    for filename in filenames:
        if filename.lower().endswith('.csv'):  # also accept '.CSV'
            csv_files.append(os.path.join(root, filename))
        else:
            other_extensions.add(os.path.splitext(filename)[1].lower())
print(f"CSV files found: {csv_files}")
if not csv_files:
    # Tell the reader what the archive actually contains so an empty result
    # does not silently propagate through the rest of the notebook.
    print(f"No CSV files in the archive; file types present: {sorted(other_extensions)}")


ZIP file extracted successfully!
CSV files found: []


**Process Each CSV File**

In [55]:
# Text-normalisation helper
def clean_text(text):
    """Normalise a string for tokenisation.

    URLs (anything starting with ``http`` up to the next whitespace) are
    removed, every run of non-word characters is replaced by a single space,
    and the result is lowercased.

    Values that are not ``str`` are handed back untouched.
    """
    if not isinstance(text, str):
        return text
    without_urls = re.sub(r'http\S+', '', text)
    words_only = re.sub(r'\W+', ' ', without_urls)
    return words_only.lower()

# Set up stopwords
stop_words = set(stopwords.words('english'))

# Initialize a list to store all cleaned tokens (reset on every run of this cell)
all_tokens = []

# Candidate names for the column holding the text, in priority order.
# Customize the list based on Facebook data structure.
potential_columns = ['Message', 'Post', 'Content', 'Body']

# Process each CSV file
for csv_file in csv_files:
    try:
        # Load the CSV file
        df = pd.read_csv(csv_file)
        print(f"\nProcessing file: {csv_file}")
        print("Columns in this file:", df.columns)

        # Pick the first candidate column that exists in this file
        text_column = next((col for col in potential_columns if col in df.columns), None)
        if text_column is None:
            print(f"No suitable text column found in {csv_file}. Skipping...")
            continue

        print(f"Using column '{text_column}' for text data.")

        # Filter out rows with missing text. `.copy()` makes the result an
        # independent frame, so the column assignments below do not write into
        # a slice of the original (avoids SettingWithCopyWarning).
        df = df[df[text_column].notnull()].copy()

        # Clean the text. `astype(str)` guards against non-string cells (e.g. a
        # numeric column), which clean_text would pass through unchanged and
        # word_tokenize would then reject.
        df['Cleaned_Content'] = df[text_column].astype(str).apply(clean_text)

        # Tokenize and remove stopwords
        df['Tokens'] = df['Cleaned_Content'].apply(
            lambda text: [word for word in word_tokenize(text) if word not in stop_words]
        )

        # Append tokens from this file to the global list
        for tokens in df['Tokens']:
            all_tokens.extend(tokens)
    except Exception as e:
        # Best-effort: one unreadable file must not stop the remaining files,
        # but the failure is reported rather than silently dropped.
        print(f"Error processing {csv_file}: {e}")


**Generate the Word Cloud**

In [56]:
# Combine all tokens into a single string
all_text = ' '.join(all_tokens)

# Generate the word cloud
if all_text:
    wordcloud = WordCloud(width=800, height=400, background_color='white').generate(all_text)
    print("Word cloud generated successfully!")
else:
    # Remove any word cloud left over from an earlier run of this cell. The
    # display and export cells decide what to do by checking whether the name
    # `wordcloud` exists, so a stale object would otherwise be shown and saved
    # as if it belonged to the current (empty) data.
    globals().pop('wordcloud', None)
    print("No text data available for word cloud.")


No text data available for word cloud.


**Display the Word Cloud**

In [57]:
# Render the word cloud, if the generation step produced one
if 'wordcloud' not in locals():
    print("Word cloud not available for display.")
else:
    plt.figure(figsize=(10, 5))
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis('off')  # an image has no meaningful axes, so hide them
    plt.show()


Word cloud not available for display.


**Save and Export the Word Cloud**

In [58]:
# Write the word cloud to a PNG file and offer it as a browser download
if 'wordcloud' not in locals():
    print("Word cloud not available for export.")
else:
    output_path = "facebook_wordcloud.png"
    wordcloud.to_file(output_path)
    print(f"Word cloud saved as '{output_path}'")

    # Trigger the download of the file that was just written
    from google.colab import files
    files.download(output_path)


Word cloud not available for export.
