# Transcription and Sentiment Analysis


**Table of Contents**  

1.  [Run Transcriptions on MP4 Videos](#sec1)
2.  [Apply Sentiment Analysis](#sec2)



In [1]:
from openai import OpenAI
import requests
from docx import Document
import os
import whisper #pip install openai-whisper
import pandas as pd
from transformers import WhisperProcessor, WhisperForConditionalGeneration
import json 
import matplotlib.pyplot as plt
import librosa
import langid
import csv

# Remember the directory the notebook process was started in.
# NOTE(review): `cwd` is not referenced by any of the cells visible here - confirm it is still needed.
cwd = os.getcwd()

<a id="sec1"></a>

## 1. Run Transcription on downloaded videos
- Make sure to change the folder path and date name for each batch

In [13]:
# Hugging Face checkpoint shared by the processor and the model.
WHISPER_CHECKPOINT = "openai/whisper-large"

# Fetch the pretrained processor and the conditional-generation model
# from that single checkpoint.
# NOTE(review): in the cells visible here `processor` is not used again and
# `model` is rebound by the transcription cell - confirm this cell is needed.
processor = WhisperProcessor.from_pretrained(WHISPER_CHECKPOINT)
model = WhisperForConditionalGeneration.from_pretrained(WHISPER_CHECKPOINT)

# Clear forced_decoder_ids so that no context tokens are forced on the decoder.
model.config.forced_decoder_ids = None



Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [4]:
# Only run to create the CSV that lists all videos.
# Directory whose files are listed (change this for each machine / batch).
folder_path = r"C:\Users\ashle\OneDrive\School\CS_315\mini_lecture\video_files"

# Keep regular files only (sub-directories are skipped). os.listdir() returns
# entries in arbitrary order, so sort them to make the CSV reproducible.
file_names = sorted(
    entry
    for entry in os.listdir(folder_path)
    if os.path.isfile(os.path.join(folder_path, entry))
)

# newline='' is what the csv module expects for files it writes to; the
# explicit UTF-8 encoding keeps non-ASCII file names from failing under the
# platform's default encoding.
with open("sentiment_Manually.csv", mode='w', newline='', encoding='utf-8') as file:
    writer = csv.writer(file)
    writer.writerow(['Filename'])  # header row
    writer.writerows([name] for name in file_names)  # one row per file

print("File names have been written to sentiment_Manually.csv")


File names have been written to sentiment_Manually.csv


In [44]:
# Only run this cell when transcriptions need to be (re)generated.
import soundfile as sf  # NOTE(review): not used in this cell - confirm before removing

# Input folder with the downloaded videos and output folder for transcripts.
video_folder_path = r"C:\Users\ashle\OneDrive\School\CS_315\mini_lecture\video_files"
output_path = "txt-transcripts/"
os.makedirs(output_path, exist_ok=True)

# NOTE(review): sampling_rate is not used in this cell - confirm before removing.
sampling_rate = 16000

# Load the openai-whisper model once; it was previously reloaded for every
# video, which repeats the same expensive load on each iteration.
# This rebinds the notebook-level name `model`, exactly as the original did.
model = whisper.load_model('base')

for filename in os.listdir(video_folder_path):
    # splitext + lower() accepts ".MP4" as well and avoids the hard-coded
    # four-character slice.
    base_filename, extension = os.path.splitext(filename)

    if extension.lower() != ".mp4":
        # Anything that is not an MP4 is skipped; the old message claimed the
        # audio was "not found", which did not describe this case.
        print(f"Skipping non-MP4 file: {filename}")
        continue

    audio_path = os.path.join(video_folder_path, filename)
    result = model.transcribe(audio_path, fp16=False)

    # Write the transcription to a text file named after the video.
    transcript_path = os.path.join(output_path, f"{base_filename}.txt")
    with open(transcript_path, "w", encoding="utf-8") as txt:
        txt.write(result['text'])
    print(f'Finished writing transcription for: {base_filename}')


Finished writing transcription for: @401.leo_video_7358973934638763307
Finished writing transcription for: @99_warstories_video_7336952062644620586
Finished writing transcription for: @a.i.newz_video_7215890031607549226
Finished writing transcription for: @abcnews_video_7089162430370123051
Finished writing transcription for: @abcnews_video_7303688413172075819
Finished writing transcription for: @ahadthecpa_video_7330267638679096619
Finished writing transcription for: @airosent_video_7180511110708170027
Finished writing transcription for: @alcofribas_video_7234217555014077742
Finished writing transcription for: @alex43274_video_6916362879042145541
Finished writing transcription for: @alvarntg_video_7328413123709357358
Finished writing transcription for: @amandagaskins78_video_7209843718885608747
Finished writing transcription for: @angryvoters_video_7092123027076304170
Finished writing transcription for: @anothermarco_video_7049840313443077381
Finished writing transcription for: @anti_n

<a id="sec2"></a>

## 2. Sentiment Analysis
- Make sure to change the folder path

In [45]:
import os
import pandas as pd

# Folder that holds the transcript .txt files
folder_path = r"C:\Users\ashle\OneDrive\School\CS_315\mini_lecture\txt-transcripts"

# Every directory entry whose name ends in .txt
files = list(filter(lambda entry: entry.endswith('.txt'), os.listdir(folder_path)))

# One record per transcript: the file name and the full text of the file
data = []
for txt_name in files:
    txt_path = os.path.join(folder_path, txt_name)
    with open(txt_path, 'r', encoding='utf-8') as handle:
        data.append({'File Name': txt_name, 'Content': handle.read()})

# Tabulate the records and show them
df = pd.DataFrame(data)
print(df)


                                       File Name  \
0         @401.leo_video_7358973934638763307.txt   
1   @99_warstories_video_7336952062644620586.txt   
2        @a.i.newz_video_7215890031607549226.txt   
3         @abcnews_video_7089162430370123051.txt   
4         @abcnews_video_7303688413172075819.txt   
..                                           ...   
95            @cnn_video_7320728660955122986.txt   
96            @cnn_video_7330341214400744750.txt   
97  @coachd_speaks_video_6915512527988313349.txt   
98        @_51zone_video_7358867679802887466.txt   
99       @__ninzim_video_7331213276577025322.txt   

                                              Content  
0    I got you. I got you. Jock your da-tchin' on,...  
1                                Thanks for watching!  
2    The papal bull Eternity reguses a formal docu...  
3    Marjorie Taylor Greene, like every member of ...  
4    We just learned that the Supreme Court has re...  
..                                     

### OpenAI GPT Classification

In [59]:
import os

import openai
from openai import OpenAI

# Prefer the OPENAI_API_KEY environment variable so a real key never has to be
# pasted into (and saved with) the notebook. When the variable is not set the
# original placeholder is used, so the previous edit-in-place workflow still
# works.
openai.api_key = os.environ.get(
    "OPENAI_API_KEY",
    "{enter API key here or ask Eni for key}",
)

# Initialize the OpenAI client used by sentiment_analysis()
client = OpenAI(api_key=openai.api_key)

def sentiment_analysis(text):
    """Classify the sentiment of *text* with an OpenAI chat model.

    The text is appended to a fixed instruction prompt and sent to
    ``gpt-3.5-turbo`` through the notebook-level ``client``.

    Args:
        text: The text to classify.

    Returns:
        The model's reply, stripped and lower-cased (the prompt asks for
        Positive, Negative, or Neutral), or ``None`` when the API returns a
        message without content.
    """
    # NOTE(review): the prompt (including the "repsonse" typo and the indented
    # continuation lines) is kept byte-for-byte so labels stay comparable with
    # earlier runs - fix the wording deliberately if a re-run is acceptable.
    prompt = f"""You are trained to analyze and detect the sentiment of the given text.
    Analyze the following text and respond whether the text is: Positive, Negative, or Neutral.
    Reply one word repsonse.
    {text}"""

    response = client.chat.completions.create(
        model="gpt-3.5-turbo",
        messages=[
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": prompt}
        ],
        # Caps the reply at one token.
        # NOTE(review): this assumes each label is a single token - confirm.
        max_tokens=1,
        temperature=0  # Keep response consistent
    )

    # message.content is nullable in the API; guard it instead of letting
    # .strip() raise AttributeError in the middle of a batch.
    content = response.choices[0].message.content
    if content is None:
        return None

    return content.strip().lower()

# Quick check of sentiment_analysis() on a single sentence
input_text = "I’m very happy with the product!"
sentiment = sentiment_analysis(input_text)
print(f"{input_text} ==== Sentiment is: {sentiment}")

I’m very happy with the product! ==== Sentiment is: positive


In [68]:
# Label every transcript with the GPT classifier.
# Transcripts that are empty or contain only whitespace get None and are not
# sent to the API (the old len() == 0 test let whitespace-only text through).
df['Sentiment_GPT'] = [
    sentiment_analysis(transcript) if transcript.strip() else None
    for transcript in df['Content']
]

# index=False keeps the row indices out of the file
df.to_csv('all_sentiment.csv', index=False)



                                       File Name  \
0         @401.leo_video_7358973934638763307.txt   
1   @99_warstories_video_7336952062644620586.txt   
2        @a.i.newz_video_7215890031607549226.txt   
3         @abcnews_video_7089162430370123051.txt   
4         @abcnews_video_7303688413172075819.txt   
..                                           ...   
95            @cnn_video_7320728660955122986.txt   
96            @cnn_video_7330341214400744750.txt   
97  @coachd_speaks_video_6915512527988313349.txt   
98        @_51zone_video_7358867679802887466.txt   
99       @__ninzim_video_7331213276577025322.txt   

                                              Content Sentiment Sentiment_GPT  
0    I got you. I got you. Jock your da-tchin' on,...  negative      negative  
1                                Thanks for watching!  positive      positive  
2    The papal bull Eternity reguses a formal docu...   neutral       neutral  
3    Marjorie Taylor Greene, like every member of ...  

In [72]:
# One-off cleanup, left disabled: uncomment to drop the 'Sentiment' column
# from df before printing.
#df = df.drop('Sentiment', axis=1)

# Show the current state of the DataFrame.
print(df)


                                       File Name  \
0         @401.leo_video_7358973934638763307.txt   
1   @99_warstories_video_7336952062644620586.txt   
2        @a.i.newz_video_7215890031607549226.txt   
3         @abcnews_video_7089162430370123051.txt   
4         @abcnews_video_7303688413172075819.txt   
..                                           ...   
95            @cnn_video_7320728660955122986.txt   
96            @cnn_video_7330341214400744750.txt   
97  @coachd_speaks_video_6915512527988313349.txt   
98        @_51zone_video_7358867679802887466.txt   
99       @__ninzim_video_7331213276577025322.txt   

                                              Content Sentiment_GPT  
0    I got you. I got you. Jock your da-tchin' on,...      negative  
1                                Thanks for watching!      positive  
2    The papal bull Eternity reguses a formal docu...       neutral  
3    Marjorie Taylor Greene, like every member of ...      negative  
4    We just learned that

### Vader Sentiment Classification

In [77]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

# Sample transcript used to try out sentiment_Vader() in this cell.
text = "I did, but I can't feel my arms. Dad, guess what? Better. I got into Eckerd. You serious? Yes. And guess what? I got a $12,000 scholarship. Why do we listen to our best friends when they tell you that something's going to happen? Oh my god. Oh my god."
 
# Shared analyzer, created on first use. Constructing a
# SentimentIntensityAnalyzer for every sentence repeats the same setup work
# for each row of the DataFrame.
_VADER_ANALYZER = None


def sentiment_Vader(sentence, analyzer=None):
    """Return the VADER sentiment label for *sentence*.

    Args:
        sentence: The text to score.
        analyzer: Optional object with a ``polarity_scores(text)`` method that
            returns a dict containing a ``'compound'`` score. When omitted, a
            single shared ``SentimentIntensityAnalyzer`` is created on first
            use and reused on later calls.

    Returns:
        ``"positive"`` if the compound score is >= 0.05, ``"negative"`` if it
        is <= -0.05, otherwise ``"neutral"``.
    """
    global _VADER_ANALYZER

    if analyzer is None:
        if _VADER_ANALYZER is None:
            _VADER_ANALYZER = SentimentIntensityAnalyzer()
        analyzer = _VADER_ANALYZER

    # polarity_scores gives a dict with pos, neg, neu, and compound scores;
    # only the compound score decides the label.
    compound = analyzer.polarity_scores(sentence)['compound']

    if compound >= 0.05:
        return "positive"
    if compound <= -0.05:
        return "negative"
    return "neutral"

# Score the sample transcript defined in this cell.
sentiment_Vader(text)

# Reference for this VADER usage:
#https://www.geeksforgeeks.org/python-sentiment-analysis-using-vader/

'positive'

In [79]:
# Label every transcript with VADER.
# Transcripts that are empty or contain only whitespace get None rather than
# being scored (the old len() == 0 test let whitespace-only text through).
df['Sentiment_Vader'] = [
    sentiment_Vader(transcript) if transcript.strip() else None
    for transcript in df['Content']
]

# index=False keeps the row indices out of the file
df.to_csv('all_sentiment.csv', index=False)

