In [None]:
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import re
import os

In [None]:
# Fetch the NLTK data packages this notebook relies on; quiet=True suppresses
# the download progress output. Presumably 'punkt' backs word_tokenize and
# 'stopwords' backs stopwords.words('english'), both used in preprocess_text.
# NOTE(review): recent NLTK releases may look for 'punkt_tab' instead of
# 'punkt' -- confirm against the installed version.
nltk.download('punkt', quiet=True)
nltk.download('stopwords', quiet=True)

# Module-level accumulator: process_file() appends one row per processed file
# and main() adds the total/percentage columns, sorts it and writes the CSV.
global_df = pd.DataFrame()

# Category name -> phrases that count as a hit for that category.
# The keys become the per-category count columns of the results frame (and the
# prefixes of the '<key>_found' / '<key>_percentage' columns), so renaming or
# reordering a key changes the output schema.
# Phrases are compared against space-joined, preprocessed tokens, so they must
# be lowercase and single-space separated.
# NOTE(review): preprocess_text replaces punctuation with spaces, so hyphenated
# text arrives as 'rear end'; the 'rear-end' entry cannot match preprocessed
# tokens and is effectively covered by 'rear end'.
# NOTE(review): preprocess_text also drops NLTK English stopwords before
# matching, so phrases containing such words (e.g. 'over the limit',
# 'under the influence') most likely never match -- confirm and consider
# storing the stopword-free form or matching on unfiltered text.
keywords_dict = {
    'prima_facie_negligence': ['prima facie', 'negligence', 'negligent'],
    'rear_end_collision': ['rear end', 'rear-end', 'collision', 'crash'],
    'sudden_stop': ['sudden stop', 'abrupt stop', 'unexpected halt'],
    'traffic_conditions': ['traffic', 'congestion', 'heavy flow'],
    'weather_conditions': ['weather', 'rain', 'snow', 'ice', 'fog'],
    'vehicle_defects': ['defect', 'malfunction', 'brake failure', 'mechanical issue'],
    'driver_distraction': ['distraction', 'distracted', 'cell phone', 'texting'],
    'speed': ['speeding', 'excessive speed', 'over the limit'],
    'intoxication': ['drunk', 'alcohol', 'intoxicated', 'under the influence'],
    'road_conditions': ['pothole', 'construction', 'poorly maintained'],
}

In [None]:
def read_file(file_path, encoding='utf-8'):
    """Return the full contents of a text file as a single string.

    Parameters
    ----------
    file_path : str or path-like
        Location of the file to read.
    encoding : str, optional
        Text encoding used to decode the file. Defaults to 'utf-8' so the
        result does not depend on the platform's locale encoding.

    Returns
    -------
    str
        Everything in the file.

    Raises
    ------
    FileNotFoundError
        If ``file_path`` does not exist.
    UnicodeDecodeError
        If the file's bytes are not valid in ``encoding``.
    """
    # An explicit encoding keeps results reproducible across machines; without
    # it open() falls back to the locale's preferred encoding.
    with open(file_path, 'r', encoding=encoding) as file:
        return file.read()

def preprocess_text(text):
    """Turn raw text into a list of lowercase tokens without stopwords.

    The text is lowercased, every character that is neither a word character
    nor whitespace is replaced by a space, the result is split with NLTK's
    ``word_tokenize``, and tokens found in NLTK's English stopword list are
    dropped.

    Parameters
    ----------
    text : str
        Raw document text.

    Returns
    -------
    list of str
        The remaining tokens, in document order.
    """
    # Punctuation becomes a space (not an empty string) so that hyphenated
    # or slash-joined words split into separate tokens.
    cleaned = re.sub(r'[^\w\s]', ' ', text.lower())
    words = word_tokenize(cleaned)
    english_stops = set(stopwords.words('english'))
    return [candidate for candidate in words if candidate not in english_stops]



In [None]:
def identify_keywords(tokens, keywords=None):
    """Count keyword-phrase hits per category in a list of tokens.

    Every token position is checked against every phrase of every category.
    A phrase matches at a position when the tokens starting there, joined
    with single spaces, equal the phrase exactly. At each position a category
    records at most one hit (the first of its phrases that matches), and a
    match does not advance the scan, so later positions are still examined.

    Parameters
    ----------
    tokens : list of str
        Tokens in document order (e.g. the output of ``preprocess_text``).
    keywords : dict, optional
        Mapping of category name -> list of phrases. When omitted, the
        module-level ``keywords_dict`` is used.

    Returns
    -------
    tuple of (dict, dict)
        ``identified_keywords`` maps each category to its number of hits;
        ``found_keywords`` maps each category to the matched phrases in the
        order they were encountered.
    """
    if keywords is None:
        keywords = keywords_dict

    identified_keywords = {key: 0 for key in keywords}  # Count of keywords found
    found_keywords = {key: [] for key in keywords}      # Actual keywords found

    # The number of words in a phrase does not depend on the scan position,
    # so compute it once instead of re-splitting at every token.
    phrase_table = [
        (key, [(phrase, len(phrase.split())) for phrase in phrases])
        for key, phrases in keywords.items()
    ]

    total = len(tokens)
    for start in range(total):
        for key, phrases in phrase_table:
            for phrase, width in phrases:
                if start + width > total:
                    continue  # phrase would run past the end of the tokens
                if ' '.join(tokens[start:start + width]) == phrase:
                    identified_keywords[key] += 1
                    found_keywords[key].append(phrase)
                    break  # at most one hit per category per position

    return identified_keywords, found_keywords

In [None]:
def process_file(file_path):
    """Analyse one text file and append its one-row summary to ``global_df``.

    The file is read, preprocessed into tokens and scanned for keywords. The
    resulting row holds, in order: ``unique_name`` (file name without
    extension), one count column per keyword category, ``file_name``,
    ``file_path``, ``total_words`` (number of tokens after preprocessing) and
    one ``<category>_found`` column per category containing the matched
    phrases joined by ', ' (or None when there were none).

    Failures are reported with ``print`` rather than raised, so an
    interactive session can continue with the next file.

    Parameters
    ----------
    file_path : str
        Path of the text file to analyse.
    """
    global global_df

    try:
        tokens = preprocess_text(read_file(file_path))
        counts, matches = identify_keywords(tokens)  # per-category counts and matched phrases
        row = pd.DataFrame([counts])

        # The file name without its extension identifies the row and goes first.
        base_name = os.path.basename(file_path)
        row.insert(0, 'unique_name', os.path.splitext(base_name)[0])

        row['file_name'] = base_name
        row['file_path'] = file_path
        row['total_words'] = len(tokens)

        # One text column per category listing what was actually matched.
        for category, phrases in matches.items():
            if phrases:
                row[category + '_found'] = ', '.join(phrases)
            else:
                row[category + '_found'] = None

        # Append to the module-level results frame.
        global_df = pd.concat([global_df, row], ignore_index=True)

        print(f"File processed: {file_path}")
    except FileNotFoundError:
        print(f"File not found: {file_path}")
    except Exception as e:
        print(f"Error processing file {file_path}: {str(e)}")


In [None]:
def main():
    """Interactively process text files, then summarise and export the results.

    Prompts for a file path over and over, passing each one to
    ``process_file``, until the user enters 'q' (case-insensitive). If
    ``global_df`` then holds any rows, a ``total_keywords`` column and one
    ``<category>_percentage`` column per key of ``keywords_dict`` are added,
    the rows are sorted by ``total_keywords`` in descending order, and the
    frame is printed and written to 'text_analysis_results.csv'. If nothing
    was collected, a short notice is printed instead.
    """
    global global_df

    while True:
        entered = input("Enter the path to a text file (or 'q' to quit): ").strip()
        if entered.lower() != 'q':
            process_file(entered)
            continue
        break

    if global_df.empty:
        print("No files were processed.")
        return

    # Total hits per file across all categories.
    keyword_columns = list(keywords_dict.keys())
    global_df['total_keywords'] = global_df[keyword_columns].sum(axis=1)

    # Each category's share of that file's total hits, in percent.
    for column in keyword_columns:
        share = global_df[column] / global_df['total_keywords']
        global_df[f'{column}_percentage'] = share * 100

    # Files with the most keyword hits come first.
    global_df = global_df.sort_values(by='total_keywords', ascending=False)

    print("\nFinal Results:")
    print(global_df)

    # Persist the summary next to the notebook.
    global_df.to_csv('text_analysis_results.csv', index=False)
    print("Results saved to 'text_analysis_results.csv'")




In [None]:
# Reset the results accumulator: replaces global_df with an empty DataFrame,
# discarding every row collected so far.
# NOTE(review): in the visible cell order this cell comes before the cells
# that display global_df and before the main() call; running it after main()
# leaves those display cells with an empty frame.
global_df = pd.DataFrame() # To Reset Dataframe

In [None]:
# Show count, matched phrases and percentage side by side for the
# driver_distraction and rear_end_collision categories.
# The *_percentage columns are only added by main(), so main() must have run
# (with at least one file processed) before this cell.
global_df[['unique_name','driver_distraction','driver_distraction_found','driver_distraction_percentage','rear_end_collision','rear_end_collision_found','rear_end_collision_percentage']]

Unnamed: 0,unique_name,driver_distraction,driver_distraction_found,driver_distraction_percentage,rear_end_collision,rear_end_collision_found,rear_end_collision_percentage
1,sample-car-accident-cases,3,"distracted, cell phone, distracted",8.108108,14,"rear end, collision, rear end, collision, coll...",37.837838
0,legal-car-accident-case,3,"distracted, distracted, distracted",18.75,5,"collision, collision, rear end, collision, rea...",31.25


In [None]:
# Inspect the column names of the accumulated results frame.
global_df.columns


Index(['unique_name', 'prima_facie_negligence', 'rear_end_collision',
       'sudden_stop', 'traffic_conditions', 'weather_conditions',
       'vehicle_defects', 'driver_distraction', 'speed', 'intoxication',
       'road_conditions', 'file_name', 'file_path', 'total_words',
       'prima_facie_negligence_found', 'rear_end_collision_found',
       'sudden_stop_found', 'traffic_conditions_found',
       'weather_conditions_found', 'vehicle_defects_found',
       'driver_distraction_found', 'speed_found', 'intoxication_found',
       'road_conditions_found', 'total_keywords',
       'prima_facie_negligence_percentage', 'rear_end_collision_percentage',
       'sudden_stop_percentage', 'traffic_conditions_percentage',
       'weather_conditions_percentage', 'vehicle_defects_percentage',
       'driver_distraction_percentage', 'speed_percentage',
       'intoxication_percentage', 'road_conditions_percentage'],
      dtype='object')

def process_file(file_path):
    """Analyse one text file and append its one-row summary to ``global_df``.

    Variant that builds the row from ``identify_keywords_2`` and does not add
    any ``<category>_found`` columns. The row holds ``unique_name`` (file name
    without extension) first, then whatever ``identify_keywords_2`` returns,
    then ``file_name``, ``file_path`` and ``total_words`` (number of tokens
    after preprocessing).

    NOTE(review): this reuses the name ``process_file``, so once this cell
    runs it replaces any earlier definition with the same name.
    NOTE(review): ``identify_keywords_2`` is not defined in the visible part
    of this notebook -- confirm it exists; otherwise every call ends in the
    generic error branch below.

    Failures are reported with ``print`` rather than raised.

    Parameters
    ----------
    file_path : str
        Path of the text file to analyse.
    """
    global global_df

    try:
        tokens = preprocess_text(read_file(file_path))
        row = pd.DataFrame([identify_keywords_2(tokens)])

        # The file name without its extension identifies the row and goes first.
        base_name = os.path.basename(file_path)
        row.insert(0, 'unique_name', os.path.splitext(base_name)[0])

        row['file_name'] = base_name
        row['file_path'] = file_path
        row['total_words'] = len(tokens)

        # Append to the module-level results frame.
        global_df = pd.concat([global_df, row], ignore_index=True)

        print(f"File processed: {file_path}")
    except FileNotFoundError:
        print(f"File not found: {file_path}")
    except Exception as e:
        print(f"Error processing file {file_path}: {str(e)}")



In [None]:
# Start the interactive session: enter one file path per prompt, then 'q' to
# finish, print the summary and write text_analysis_results.csv.
main()

Enter the path to a text file (or 'q' to quit): /content/drive/MyDrive/legal-car-accident-case.txt
File processed: /content/drive/MyDrive/legal-car-accident-case.txt
Enter the path to a text file (or 'q' to quit): /content/drive/MyDrive/sample-car-accident-cases.txt
File processed: /content/drive/MyDrive/sample-car-accident-cases.txt
Enter the path to a text file (or 'q' to quit): q

Final Results:
                 unique_name  prima_facie_negligence  rear_end_collision  \
1  sample-car-accident-cases                       9                  14   
0    legal-car-accident-case                       3                   5   

   sudden_stop  traffic_conditions  weather_conditions  vehicle_defects  \
1            1                   6                   4                0   
0            0                   4                   1                0   

   driver_distraction  speed  intoxication  ...  \
1                   3      0             0  ...   
0                   3      0             