### Task 1: Extract insights from data

In [None]:
import pandas as pd
import json
import matplotlib.pyplot as plt
import seaborn as sns

def load_data(file_path):
    """Parse the JSON document stored at ``file_path``.

    The file is opened in text mode and decoded as UTF-8.

    Args:
        file_path: Path of the JSON file to read.

    Returns:
        The Python object produced by ``json.load`` for the file's content.
    """
    with open(file_path, mode='r', encoding='utf-8') as handle:
        parsed = json.load(handle)
    return parsed

def extract_data(data):
    """Reduce every entry to just its 'text' and 'category' fields.

    Args:
        data: Iterable of mappings; each one must provide the keys
            'text' and 'category' (a missing key raises ``KeyError``).

    Returns:
        A list with one ``{'text': ..., 'category': ...}`` dict per entry,
        in the same order as ``data``.
    """
    wanted_fields = ('text', 'category')
    return [{field: entry[field] for field in wanted_fields} for entry in data]

def preprocess_data(df):
    """Add surface-level text features to ``df`` and return it.

    Each feature is computed from the 'text' column, one value per row, and
    stored in a new column. The DataFrame is modified in place; the same
    object is returned for convenience.

    Args:
        df: DataFrame with a 'text' column of strings.

    Returns:
        ``df`` with the feature columns listed in ``feature_builders`` added.
    """
    def word_total(text):
        return len(text.split())

    def distinct_word_total(text):
        return len(set(text.split()))

    def uppercase_word_total(text):
        # A word counts when str.isupper() is true for the whole token.
        return sum(word.isupper() for word in text.split())

    def exclamation_total(text):
        return text.count('!')

    def has_link(text):
        return 'http' in text

    def has_parenthesis(text):
        return any(mark in text for mark in '()')

    def has_quote(text):
        return any(mark in text for mark in ('"', "'"))

    def mentions_5g(text):
        return any(variant in text for variant in ('5G', '5 G'))

    def mentions_bill_gates(text):
        return 'Bill Gates' in text

    # Column name -> per-text function; dict order fixes the column order.
    feature_builders = {
        'text_length': word_total,
        'unique_word_count': distinct_word_total,
        'uppercase_word_count': uppercase_word_total,
        'exclamation_count': exclamation_total,
        'contains_link': has_link,
        'contains_parentheses': has_parenthesis,
        'contains_quotation_marks': has_quote,
        'contains_5G': mentions_5g,
        'contains_bill_gates': mentions_bill_gates,
    }
    for column, builder in feature_builders.items():
        df[column] = df['text'].apply(builder)
    return df

def plot_histogram(df, category, title, xlabel, ylabel):
    """Overlay one histogram per class for a single column and show the figure.

    Rows are split on the 'category' column into the 'CONSPIRACY' and
    'CRITICAL' classes, and the values of the requested column are drawn as
    two semi-transparent 30-bin histograms on the same axes.

    Args:
        df: DataFrame with a 'category' column and the column to plot.
        category: Name of the column whose values are binned (despite the
            parameter name, this is not a class label).
        title: Figure title.
        xlabel: Label of the x axis.
        ylabel: Label of the y axis.
    """
    plt.figure(figsize=(10, 5))
    for class_label in ('CONSPIRACY', 'CRITICAL'):
        class_values = df.loc[df['category'] == class_label, category]
        class_values.hist(alpha=0.5, label=class_label, bins=30)
    plt.legend()
    plt.title(title)
    plt.xlabel(xlabel)
    plt.ylabel(ylabel)
    plt.show()

def plot_bar_chart(df, feature, title, ylabel):
    """Show a bar chart of the per-category mean of ``feature``.

    For boolean feature columns the mean is the proportion of rows where the
    feature is true.

    Args:
        df: DataFrame with a 'category' column and the ``feature`` column.
        feature: Name of the column to average within each category.
        title: Chart title.
        ylabel: Label of the y axis.
    """
    category_means = df.groupby('category')[feature].mean()
    category_means.plot.bar(color=['blue', 'green'])
    plt.title(title)
    plt.ylabel(ylabel)
    plt.show()

def main():
    """Run the Task 1 exploration: load the dataset, derive features, plot them."""
    # Load the raw entries and keep only text + category
    records = extract_data(load_data('Oppositional_thinking_analysis_dataset.json'))

    df = pd.DataFrame(records)
    print(df.head())

    # Add the derived feature columns
    df = preprocess_data(df)

    # Distribution of text lengths per category
    plot_histogram(df, 'text_length', 'Text Length Distribution by Category', 'Number of Words', 'Frequency')

    # (feature column, chart title, y-axis label), drawn in this order:
    # first the per-category averages of the count features, then the
    # per-category proportions of the boolean features.
    bar_charts = [
        ('unique_word_count', 'Average Number of Unique Words by Category', 'Average Unique Words'),
        ('uppercase_word_count', 'Average Number of Uppercase Words by Category', 'Average Uppercase Words'),
        ('exclamation_count', 'Average Number of Exclamation Marks by Category', 'Average Exclamation Marks'),
        ('contains_link', 'Proportion of Texts Containing Links by Category', 'Proportion Containing Links'),
        ('contains_parentheses', 'Proportion of Texts Containing Parentheses by Category', 'Proportion Containing Parentheses'),
        ('contains_quotation_marks', 'Proportion of Texts Containing Quotation Marks by Category', 'Proportion Containing Quotation Marks'),
        ('contains_5G', 'Proportion of Texts Containing "5G" or "5 G" by Category', 'Proportion Containing "5G" or "5 G"'),
        ('contains_bill_gates', 'Proportion of Texts Containing "Bill Gates" by Category', 'Proportion Containing "Bill Gates"'),
    ]
    for feature, chart_title, axis_label in bar_charts:
        plot_bar_chart(df, feature, chart_title, axis_label)

# Run the Task 1 analysis only when this code is executed directly
# (module name is "__main__"), not when it is imported.
if __name__ == "__main__":
    main()


### Task 2: Pre-processing

In [None]:
import pandas as pd
import json
import re
import nltk
import inflect
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

# Download the NLTK resources needed by the pre-processing below:
#   stopwords -> stop-word list, wordnet -> WordNetLemmatizer,
#   punkt / punkt_tab -> sentence models used by word_tokenize.
# NLTK >= 3.8.2 loads 'punkt_tab' instead of the pickled 'punkt' models, so
# both are fetched to keep word_tokenize working on old and new releases.
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('wordnet')

# Load the raw dataset. The file is decoded explicitly as UTF-8 rather than
# with the platform's default encoding.
with open('Oppositional_thinking_analysis_dataset.json', 'r', encoding='utf-8') as file:
    data = json.load(file)

# Keep only the 'text' and 'category' fields of each entry
# (every entry must provide both keys)
data_extracted = [{'text': entry['text'], 'category': entry['category']} for entry in data]

# Build a DataFrame with one row per entry
df = pd.DataFrame(data_extracted)

# Uncomment to display the first few rows of the DataFrame
# print(df.head())

# Inflect engine, used by preprocess_text (as the global `p`) to spell out numbers
p = inflect.engine()

# Define pre-processing functions

# Resources that do not change between calls are built once here instead of
# once per text (lemmatizer, regexes) or once per token (stop-word list).
_URL_PATTERN = re.compile(r'http\S+|www\S+')  # 'https...' is already covered by 'http\S+'
_DIGITS_PATTERN = re.compile(r'\d+')
_PUNCTUATION_PATTERN = re.compile(r'[^\w\s]')
_STOP_WORDS = frozenset(stopwords.words('english'))  # set => O(1) membership tests
_LEMMATIZER = WordNetLemmatizer()


def _spell_out_number(match):
    """Return the digits matched by ``match`` as space-separated English words."""
    words = p.number_to_words(match.group())
    # inflect emits hyphens and commas ("twenty-one", "one thousand, two
    # hundred"). Normalise them here so the later punctuation removal does not
    # fuse the parts into a single pseudo-word such as "twentyone".
    return words.replace('-', ' ').replace(',', '')


def preprocess_text(text):
    """Normalise a raw text for downstream NLP tasks.

    Steps, in order: lowercase, drop URLs, spell out numbers as words, remove
    punctuation, tokenize, drop English stop words, lemmatize, and join the
    remaining tokens with single spaces.

    Args:
        text: The raw input string.

    Returns:
        The cleaned text as a single space-separated string (empty when no
        token survives).
    """
    # Lowercase
    text = text.lower()
    # Remove URLs
    text = _URL_PATTERN.sub('', text)
    # Replace numbers with words
    text = _DIGITS_PATTERN.sub(_spell_out_number, text)
    # Remove punctuation
    text = _PUNCTUATION_PATTERN.sub('', text)
    # Tokenization
    tokens = word_tokenize(text)
    # Remove stop words, then lemmatize what is left
    tokens = [_LEMMATIZER.lemmatize(word) for word in tokens if word not in _STOP_WORDS]
    # Join tokens back to a string; tokens carry no surrounding whitespace
    return ' '.join(tokens)

# Add a 'processed_text' column holding the cleaned version of each text
df['processed_text'] = df['text'].apply(preprocess_text)

# Display the first few rows of the DataFrame after pre-processing
print(df.head())

# Save the pre-processed DataFrame (optional). With orient='records' and
# lines=True the output holds one JSON object per line (JSON Lines), so it must
# be read back with pd.read_json(..., lines=True). force_ascii=False writes
# non-ASCII characters as-is instead of \u escapes.
df.to_json('preprocessed_oppositional_thinking_analysis_dataset.json', orient='records', lines=True, force_ascii=False)

### Task 3: Text classification

### Task 4: Textual similarity

### Bonus Task: Textual similarity