In [None]:
import pandas as pd
import os
import json
import csv
import nltk
import spacy
from collections import defaultdict
import string
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from datetime import datetime
import os
import pandas as pd

# Load Loughran-McDonald sentiments

In [None]:
def load_loughran_mcdonald_dictionary(filepath, sheet_names=None):
    """Load Loughran-McDonald word lists from an Excel workbook.

    Each requested sheet is read without a header row and the values in its
    first column become one lower-cased set of words.

    Args:
        filepath: Path to the workbook; passed straight to ``pd.read_excel``.
        sheet_names: Optional iterable of sheet names to load. Defaults to the
            seven sentiment sheets this notebook relies on.

    Returns:
        dict mapping each sheet name to a ``set`` of lower-cased, stripped
        word strings. An empty sheet yields an empty set.
    """
    if sheet_names is None:
        sheet_names = ['Negative', 'Positive', 'Uncertainty', 'Litigious',
                       'StrongModal', 'WeakModal', 'Constraining']

    sentiments = {}
    for sheet in sheet_names:
        sheet_frame = pd.read_excel(filepath, sheet_name=sheet, header=None)

        if sheet_frame.empty:
            # No cells at all: there is no first column to index.
            sentiments[sheet] = set()
            continue

        # Select the first column explicitly; squeeze() would collapse a
        # one-row sheet to a scalar, which has no tolist().
        first_column = sheet_frame.iloc[:, 0].dropna()

        # str() guards against cells parsed as non-strings (numbers, booleans),
        # which would otherwise fail on .lower().
        cleaned = (str(word).strip().lower() for word in first_column)
        sentiments[sheet] = {word for word in cleaned if word}

    return sentiments

# Relative path: the workbook is looked up from the notebook's working directory.
dictionary_path = 'LoughranMcDonald_SentimentWordLists_2018.xlsx'
# Module-level lookup (sheet name -> set of lower-cased words) read by
# parse_dependencies and by the transcript-processing loop further down.
sentiment_dictionary = load_loughran_mcdonald_dictionary(dictionary_path)

# Calculate sentiment scores based on tokens

In [None]:
# Download necessary NLTK data files
nltk.download('punkt')
# Recent NLTK releases (3.9+) load the word/sentence tokenizer tables from
# 'punkt_tab' instead of 'punkt'; without it nltk.word_tokenize raises
# LookupError. 'punkt' is still fetched for older installations.
nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('wordnet')

# Initialize lemmatizer, stop words, and spaCy model
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))
nlp = spacy.load("en_core_web_sm")

# All possible sentiment categories including modals and raw modal counts.
# The order of this list is the column order of the CSVs written below.
sentiment_categories = ["negative", "positive", "uncertainty", "litigious", "constraining",
                        "strong_modal_positive", "strong_modal_negative", "strong_modal_uncertainty",
                        "strong_modal_litigious", "strong_modal_constraining",
                        "weak_modal_positive", "weak_modal_negative", "weak_modal_uncertainty",
                        "weak_modal_litigious", "weak_modal_constraining",
                        "strong_modal_raw_count", "weak_modal_raw_count"]

# Input folder of per-company transcript JSON files and output folder for the
# per-company sentiment CSVs; the output folder is created if missing.
input_dir = 'Tickers_Json'
output_dir = 'Sentiment_CSVs2'
os.makedirs(output_dir, exist_ok=True)

# preprocess text
def preprocess_text(text):
    """Tokenize, filter and lemmatize a piece of text.

    The text is lower-cased and split with ``nltk.word_tokenize``. Tokens that
    are stop words, or that consist only of punctuation characters, are
    dropped; the remaining tokens are passed through the module-level
    ``lemmatizer``.

    Args:
        text: The raw string to process.

    Returns:
        list of lemmatized token strings, in their original order.
    """
    tokens = nltk.word_tokenize(text.lower())
    return [
        lemmatizer.lemmatize(word)
        for word in tokens
        if word not in stop_words
        # Check character by character: `word in string.punctuation` is a
        # substring test on one string, so multi-character punctuation tokens
        # such as "--" or "..." used to slip through.
        and not all(char in string.punctuation for char in word)
    ]

# calculate sentiment scores based on tokens
def calculate_sentiment(tokens, sentiment_words):
    """Count the tokens that belong to a sentiment word collection.

    Every occurrence counts, so a word repeated in ``tokens`` is counted once
    per repetition.

    Args:
        tokens: Iterable of token strings.
        sentiment_words: Container supporting ``in`` lookups.

    Returns:
        int number of matching tokens (0 when nothing matches).
    """
    hits = 0
    for candidate in tokens:
        if candidate in sentiment_words:
            hits += 1
    return hits

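# count sentiment words that are syntactic children of strong/weak modal terms (spaCy dependency parse)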
def parse_dependencies(raw_text):
    """Count modal terms and the sentiment of the words attached to them.

    The text is parsed with the module-level spaCy pipeline ``nlp``. Every
    token whose lower-cased lemma is in the ``StrongModal`` or ``WeakModal``
    word list increments ``<modal>_raw_count``. For each child of such a
    token with one of the selected dependency labels, the child's lower-cased
    lemma is looked up in the sentiment lists and the first matching category
    increments ``<modal>_<category>``.

    Args:
        raw_text: Unprocessed text to parse.

    Returns:
        ``defaultdict(int)`` keyed by names such as ``strong_modal_raw_count``
        or ``weak_modal_negative``; only keys that were incremented exist.
    """
    doc = nlp(raw_text)
    syntactic_sentiments = defaultdict(int)

    # StrongModal is checked before WeakModal, so a word in both lists counts as strong.
    modal_sheets = (("strong_modal", "StrongModal"), ("weak_modal", "WeakModal"))
    # First match wins, so the order decides how a word present in several
    # lists is classified.
    category_sheets = (
        ("positive", "Positive"),
        ("negative", "Negative"),
        ("uncertainty", "Uncertainty"),
        ("litigious", "Litigious"),
        ("constraining", "Constraining"),
    )
    relevant_deps = ("dobj", "ccomp", "xcomp", "advcl", "acomp", "neg", "amod", "npadvmod")

    for token in doc:
        # The word lists are lower-cased when loaded, so lemmas are
        # lower-cased before lookup; otherwise a capitalised lemma never matches.
        modal_lemma = token.lemma_.lower()
        modal_type = next(
            (prefix for prefix, sheet in modal_sheets
             if modal_lemma in sentiment_dictionary[sheet]),
            None,
        )
        if modal_type is None:
            continue

        syntactic_sentiments[f"{modal_type}_raw_count"] += 1

        for child in token.children:
            if child.dep_ not in relevant_deps:
                continue

            action = child.lemma_.lower()
            for suffix, sheet in category_sheets:
                if action in sentiment_dictionary[sheet]:
                    syntactic_sentiments[f"{modal_type}_{suffix}"] += 1
                    break

    return syntactic_sentiments

# Score every transcript JSON in input_dir and write one CSV per company,
# with one row per (year, quarter).
for file_name in os.listdir(input_dir):
    # Anything that is not a .json file is ignored.
    if not file_name.endswith('.json'):
        continue

    file_path = os.path.join(input_dir, file_name)
    company_name = os.path.splitext(file_name)[0]
    output_file = os.path.join(output_dir, f"{company_name}_sentiments.csv")

    # Read as UTF-8 explicitly so the result does not depend on the platform's
    # default text encoding.
    with open(file_path, 'r', encoding='utf-8') as file:
        data = json.load(file)

    # year -> quarter -> {'sentiments': category counts, 'dates': entry dates}
    year_quarter_sentiments = defaultdict(lambda: defaultdict(lambda: {'sentiments': defaultdict(int), 'dates': []}))

    for entry in data:
        year = entry['year']
        quarter = entry['quarter']
        date = entry['time']
        # NOTE(review): assumes each segment['speech'] is a list of strings; a
        # plain string here would be joined character by character — confirm
        # against the JSON schema.
        transcript = ' '.join([speech for segment in entry['transcript'] for speech in segment['speech']])

        bucket = year_quarter_sentiments[year][quarter]
        counts = bucket['sentiments']

        # Ensure every category has a value WITHOUT resetting it: assigning 0
        # for each entry discarded the counts of earlier entries that share
        # the same year/quarter, while their dates were still kept.
        for category in sentiment_categories:
            counts.setdefault(category, 0)

        tokens = preprocess_text(transcript)

        # Bag-of-words counts for the five plain sentiment categories.
        for category, sheet in (('negative', "Negative"),
                                ('positive', "Positive"),
                                ('uncertainty', "Uncertainty"),
                                ('litigious', "Litigious"),
                                ('constraining', "Constraining")):
            counts[category] += calculate_sentiment(tokens, sentiment_dictionary[sheet])

        # Modal-related counts from the dependency parse of the raw transcript.
        syntactic_sentiments = parse_dependencies(transcript)
        for sentiment, count in syntactic_sentiments.items():
            counts[sentiment] += count

        bucket['dates'].append(date)

    # Write sentiment data to CSV
    with open(output_file, mode='w', newline='') as file:
        writer = csv.writer(file)
        header = ['ID', 'Year', 'Quarter', 'Date'] + sentiment_categories
        writer.writerow(header)

        for year, quarters in year_quarter_sentiments.items():
            # `aggregate` (not `data`) so the loaded JSON is not shadowed.
            for quarter, aggregate in quarters.items():
                unique_id = f"{year}_Q{quarter}"
                # min() compares the stored 'time' values directly.
                # NOTE(review): this is the earliest date only if those values
                # sort chronologically (e.g. ISO-formatted strings) — confirm.
                earliest_date = min(aggregate['dates'])
                row = [unique_id, year, quarter, earliest_date]
                for category in sentiment_categories:
                    row.append(aggregate['sentiments'][category])
                writer.writerow(row)

    print(f"Sentiment data for {company_name} saved to {output_file}")


# Overall Sentiment Score calculation

In [None]:
# Read the per-company sentiment CSVs from the folder the transcript-processing
# cell writes to. This rebinds the same global names that cell uses, so
# re-running that cell after this one would read from and write to these folders.
input_dir = 'Sentiment_CSVs2'
# NOTE(review): 'Sentioment' looks like a typo for 'Sentiment'; left unchanged
# because anything that reads this folder would have to be renamed with it — confirm.
output_dir = 'Sentioment_Scores2'
# exist_ok=True keeps this cell safe to re-run when the folder already exists.
os.makedirs(output_dir, exist_ok=True)


# Sentiment Score calculation functions
def calculate_nss(row):
    """Compute the net sentiment score of one row.

    The score is ``(positive - negative) / (positive + negative + 1e-6)``;
    the small constant keeps the denominator non-zero when both counts are 0.

    Args:
        row: Mapping-like object with ``'positive'`` and ``'negative'`` entries.

    Returns:
        The ratio described above.
    """
    positive_count = row['positive']
    negative_count = row['negative']
    net = positive_count - negative_count
    total = positive_count + negative_count + 1e-6
    return net / total


def calculate_wss(row):
    """Compute the weighted sentiment score of one row.

    Six columns are combined with fixed weights (plain +/-1.0, strong-modal
    +/-1.5, weak-modal +/-0.5) and the weighted sum is divided by the sum of
    the absolute weights.

    Args:
        row: Mapping-like object holding the six weighted columns.

    Returns:
        The weighted sum divided by the total absolute weight.
    """
    weighted_columns = (
        ('positive', 1.0),
        ('negative', -1.0),
        ('strong_modal_positive', 1.5),
        ('strong_modal_negative', -1.5),
        ('weak_modal_positive', 0.5),
        ('weak_modal_negative', -0.5),
    )

    weighted_total = 0.0
    weight_mass = 0.0
    for column, weight in weighted_columns:
        weighted_total += row[column] * weight
        weight_mass += abs(weight)

    return weighted_total / weight_mass


def calculate_overall_sentiment(row):
    """Compute the overall sentiment score of one row.

    The score is an un-normalised weighted sum: positive counts add, while
    negative, uncertainty, litigious, constraining and the raw modal counts
    subtract with the weights listed below.

    Args:
        row: Mapping-like object holding the seven weighted columns.

    Returns:
        The weighted sum of the seven columns.
    """
    category_weights = (
        ('positive', 1.0),
        ('negative', -1.0),
        ('uncertainty', -0.5),
        ('litigious', -0.5),
        ('constraining', -0.5),
        ('strong_modal_raw_count', -0.2),
        ('weak_modal_raw_count', -0.1),
    )
    contributions = [row[category] * weight for category, weight in category_weights]
    return sum(contributions)

# Defined once, outside the loop, so it is not re-created for every file.
def binary_sentiment(nss):
    """Label an NSS value as 'positive', 'negative' or 'neutral'.

    Values above 0.1 are 'positive', values below -0.1 are 'negative', and
    everything in between (bounds included) is 'neutral'.
    """
    if nss > 0.1:
        return 'positive'
    elif nss < -0.1:
        return 'negative'
    else:
        return 'neutral'


# Add the score columns to every sentiment CSV in input_dir and save the
# result under the same file name in output_dir.
for file_name in os.listdir(input_dir):
    # Anything that is not a .csv file is ignored.
    if not file_name.endswith('.csv'):
        continue

    file_path = os.path.join(input_dir, file_name)

    df = pd.read_csv(file_path)

    # Calculate NSS, WSS, and Overall Sentiment Score (row-wise)
    df['NSS'] = df.apply(calculate_nss, axis=1)
    df['WSS'] = df.apply(calculate_wss, axis=1)
    df['Overall_Sentiment_Score'] = df.apply(calculate_overall_sentiment, axis=1)

    # Determine the sentiment label from NSS
    df['binary_sentiment'] = df['NSS'].apply(binary_sentiment)

    output_file = os.path.join(output_dir, file_name)
    df.to_csv(output_file, index=False)

    print(f"Processed {file_name} and saved to {output_file}")

