# Text

In [1]:
import os
import time
import google.generativeai as genai
import pandas as pd
from tqdm import tqdm
import pickle

genai.configure(api_key="YOUR_API_KEY")

def upload_to_gemini(path, mime_type=None):
  """Uploads the given file to Gemini.

  See https://ai.google.dev/gemini-api/docs/prompting_with_media
  """
  file = genai.upload_file(path, mime_type=mime_type)
  print(f"Uploaded file '{file.display_name}' as: {file.uri}")
  return file

def wait_for_files_active(files):
  """Waits for the given files to be active.

  Some files uploaded to the Gemini API need to be processed before they can be
  used as prompt inputs. The status can be seen by querying the file's "state"
  field.

  This implementation uses a simple blocking polling loop. Production code
  should probably employ a more sophisticated approach.
  """
  print("Waiting for file processing...")
  for name in (file.name for file in files):
    file = genai.get_file(name)
    while file.state.name == "PROCESSING":
      print(".", end="", flush=True)
      time.sleep(10)
      file = genai.get_file(name)
    if file.state.name != "ACTIVE":
      raise Exception(f"File {file.name} failed to process")
  print("...all files ready")
  print()

# Create the model
generation_config = {
  "temperature": 1,
  "top_p": 0.95,
  "top_k": 40,
  "max_output_tokens": 8192,
  "response_mime_type": "text/plain",
}

model = genai.GenerativeModel(
  model_name="gemini-1.5-pro",
  generation_config=generation_config,
)

# TODO Make these files available on the local file system
# You may need to update the file paths
if os.path.exists('gemini_text/files.pkl'):
    with open('gemini_text/files.pkl', 'rb') as file:
        files = pickle.load(file)
else:
  files = []
  for i in range(1, 119):
      try:
        files.append(upload_to_gemini(f"transcript/Script_txt/video{str(i)}.txt", mime_type="text/txt"))
      except:
        print(f"video{str(i)}.txt is not found")
        continue

  # Some files have a processing delay. Wait for them to be ready.
  wait_for_files_active(files)
  with open('gemini_text/files.pkl', 'wb') as file_save:
      pickle.dump(files, file_save)

# add assetiveness if want more
# drop future orientation if want less
variables = ['innovativeness','proactivity','trustworthiness']
df = pd.DataFrame()
for var in variables:
  print(f"Analyzing {var}...")
  if os.path.exists(f"gemini_text/{var}.pkl"):
    with open(f"gemini_text/{var}.pkl", "rb") as file:
        result = pickle.load(file)
  else:
    result = {'video':[],var:[]}
  if len(result['video']) == len(files):
    continue
  for file in tqdm(files):
    if file.display_name in result['video']:
        continue
    chat_session = model.start_chat()
    response = chat_session.send_message([f"Analyze the {var.replace('-',' ')} of small business owners in entrepreneurship from the text file. Provide a rating on a scale of 1 to 10.",file], request_options={"timeout": 1000})
    result['video'].append(file.display_name)
    result[var].append(str(response.parts[0]))
    with open(f"gemini_text/{var}.pkl", "wb") as file:
        pickle.dump(result, file)
    
  df_temp = pd.DataFrame(result)
  df = df.merge(df_temp, on='video', how='outer') if not df.empty else df_temp
  df.to_csv('gemini_text/overall_score.csv')

Uploaded file 'video1.txt' as: https://generativelanguage.googleapis.com/v1beta/files/vgottmimvvcg
Uploaded file 'video2.txt' as: https://generativelanguage.googleapis.com/v1beta/files/q01lsq53k0jf
Uploaded file 'video3.txt' as: https://generativelanguage.googleapis.com/v1beta/files/lxdg9mxxxyop
Uploaded file 'video4.txt' as: https://generativelanguage.googleapis.com/v1beta/files/6bh1wcfica3h
Uploaded file 'video5.txt' as: https://generativelanguage.googleapis.com/v1beta/files/m8s91ly7215r
Uploaded file 'video6.txt' as: https://generativelanguage.googleapis.com/v1beta/files/bvsc6hd40loi
Uploaded file 'video7.txt' as: https://generativelanguage.googleapis.com/v1beta/files/t8vizrf83hk0
Uploaded file 'video8.txt' as: https://generativelanguage.googleapis.com/v1beta/files/wuhuudzotwjp
Uploaded file 'video9.txt' as: https://generativelanguage.googleapis.com/v1beta/files/btplp8m9svnk
Uploaded file 'video10.txt' as: https://generativelanguage.googleapis.com/v1beta/files/ncv6fjic38ce
Uploaded 

100%|██████████| 117/117 [16:45<00:00,  8.60s/it]


Analyzing proactivity...


100%|██████████| 117/117 [13:19<00:00,  6.83s/it]


Analyzing trustworthiness...


100%|██████████| 117/117 [14:17<00:00,  7.33s/it]


# Audio

In [1]:
import os

In [2]:
os.getcwd()

'/Users/zhouzhuofu/Desktop/RA/moduality'

def save_material():

    while True:
        # Prompt for the video number
        video_number = input("Enter the video number: ")
        
        # Validate video number
        if not video_number.isdigit():
            print("Finished saving materials.")
            return
        
        if int(video_number) <= 0:
            print("Finished saving materials.")
            return
        # Generate the file name
        file_name = f"sentiment/video{video_number}.txt"
        
        # Prompt for the material
        print(f"Paste the material below for video{video_number}. Press Enter twice to save:")
        lines = []
        line = input()
        if line == "":
            break
        lines.append(line)
        
        # Join the lines into a single string
        material = "\n".join(lines)
        
        # Save the material to the corresponding file
        with open(file_name, "w", encoding="utf-8") as file:
            file.write(material)
        
        print(f"Material saved to {file_name}")

# Run the function
save_material()


In [5]:
result_dict ={'video':[], 'script':[], 'sentiment':[]}
for i in range(1, 118):
    if i == 37:
        continue
    script_path = f'Script_txt/video{str(i)}.txt'
    sentiment_path = f'sentiment/video{str(i)}.txt'
    with open(script_path, 'r', encoding='utf-8') as f:
        result_dict['script'].append(f.read())
    with open(sentiment_path, 'r',encoding='utf-8') as f:
        result_dict['sentiment'].append(f.read())
    result_dict['video'].append(i)
    

In [6]:
import pandas as pd

df = pd.DataFrame(result_dict)
df.head()

Unnamed: 0,video,script,sentiment
0,1,video1.mp4\n\nSpeaker1: [00:00:01] Hamoodur Ra...,**Segment 1: [00:00:01] Hamoodur Rahman. Moham...
1,2,"video2.mp4\n\nSpeaker1: [00:00:01] Mommy, is i...",1. **Segment: [00:00:01] to [00:00:12]** - ...
2,3,"video3.mp4\n\nSpeaker1: [00:00:00] Hi, I'm Rob...",**Segment 1** - **Timestamp:** [00:00:00] - ...
3,4,video4.mp4\n\nSpeaker1: [00:00:01] Hamoodur Ra...,### Segment 1: [00:00:01] - **Sentiment:** Neu...
4,5,video5.mp4\n\nSpeaker1: [00:00:08] I am a prof...,**Segment: [00:00:08] - [00:01:47]** 1. **Sen...


In [7]:
df.to_csv('startup_audio_sentiment_raw.csv')

In [60]:
import re
variables = ['assertiveness','creativity','enthusiasm','future-orientation','goal-orientation','optimism','proactive-personality','']
df = pd.DataFrame()
for var in variables:
    with open(f"{var}/result.txt", "r") as file:
        raw_output = file.read().lower().replace('\n','')
    text_cleaned = re.sub(r"video\s+(\d+)", r"video\1", raw_output)
    pattern = r"(video[0-9]+).*?([0-9]+(\.\d+)?/10)"
    matches = re.findall(pattern, text_cleaned)
    rating_dict = {'video':[], var:[]}
# Print results
    for match in matches:
        video, rating = int(match[0].replace('video','')), float(match[1].replace('/10',''))  # Extract video name and full rating
        #print(f"{video}: {rating}")  # Print video name and rating 
        rating_dict['video'].append(video)
        rating_dict[var].append(rating)
    df_temp = pd.DataFrame(rating_dict).drop_duplicates(subset='video')
    df = df.merge(df_temp, on='video', how='outer') if not df.empty else df_temp
    #print(rating_dict)
    #break
    

In [62]:
df.to_csv('startup_audio_var.csv')

In [56]:
import pandas as pd
import numpy as np

df = pd.DataFrame.from_dict(rating_dict).drop_duplicates(subset='video')

In [57]:
df

Unnamed: 0,video,assertiveness
0,1,6.0
1,2,7.0
2,3,8.0
3,4,6.0
5,5,5.0
...,...,...
111,112,7.0
112,113,7.0
114,115,6.0
115,117,7.0


In [40]:

pattern = r"(video[0-9]+).*?([0-9]+(\.\d+)?/10)"
matches = re.findall(pattern, text_cleaned)

# Print results
for match in matches:
    video, rating = match[0], match[1]  # Extract video name and full rating
    print(f"{video}: {rating}")

video1: 6/10
video2: 7/10
video3: 8/10
video4: 6/10
video1: 6/10
video5: 5/10
video6: 9/10
video7: 9/10
video6: 9/10
video8: 8/10
video9: 7/10
video10: 7/10
video11: 9/10
video12: 7/10
video13: 8/10
video14: 6/10
video15: 7/10
video16: 9/10
video17: 8/10
video18: 8/10
video19: 7/10
video20: 8/10
video21: 8/10
video22: 7/10
video23: 7/10
video24: 8/10
video25: 7/10
video26: 7/10
video27: 9/10
video28: 7/10
video29: 6/10
video30: 5/10
video31: 7/10
video32: 7/10
video33: 8/10
video34: 8/10
video35: 6/10
video36: 7/10
video38: 7/10
video39: 8/10
video40: 7/10
video41: 8/10
video42: 9/10
video43: 7/10
video44: 8/10
video45: 7/10
video46: 8/10
video47: 7/10
video48: 7/10
video49: 8/10
video50: 8/10
video51: 8/10
video52: 6/10
video51: 7/10
video55: 7/10
video56: 6/10
video57: 7/10
video58: 6/10
video59: 5/10
video60: 6/10
video61: 7/10
video62: 6/10
video63: 7/10
video64: 5/10
video65: 7/10
video66: 8/10
video67: 7/10
video68: 8/10
video69: 5/10
video70: 6/10
video71: 6/10
video72: 6/10
vid

# Video

In [1]:
!pip uninstall numpy
!pip install numpy
y


Found existing installation: numpy 2.2.1
Uninstalling numpy-2.2.1:
  Would remove:
    /opt/anaconda3/envs/Big_data/bin/f2py
    /opt/anaconda3/envs/Big_data/bin/numpy-config
    /opt/anaconda3/envs/Big_data/lib/python3.13/site-packages/numpy-2.2.1.dist-info/*
    /opt/anaconda3/envs/Big_data/lib/python3.13/site-packages/numpy/*
Proceed (Y/n)? ^C
[31mERROR: Operation cancelled by user[0m[31m


In [2]:
import os
import time
import google.generativeai as genai
import pandas as pd
from tqdm import tqdm
import pickle

genai.configure(api_key="YOUR_API_KEY")

def upload_to_gemini(path, mime_type=None):
  """Uploads the given file to Gemini.

  See https://ai.google.dev/gemini-api/docs/prompting_with_media
  """
  file = genai.upload_file(path, mime_type=mime_type)
  print(f"Uploaded file '{file.display_name}' as: {file.uri}")
  return file

def wait_for_files_active(files):
  """Waits for the given files to be active.

  Some files uploaded to the Gemini API need to be processed before they can be
  used as prompt inputs. The status can be seen by querying the file's "state"
  field.

  This implementation uses a simple blocking polling loop. Production code
  should probably employ a more sophisticated approach.
  """
  print("Waiting for file processing...")
  for name in (file.name for file in files):
    file = genai.get_file(name)
    while file.state.name == "PROCESSING":
      print(".", end="", flush=True)
      time.sleep(10)
      file = genai.get_file(name)
    if file.state.name != "ACTIVE":
      raise Exception(f"File {file.name} failed to process")
  print("...all files ready")
  print()

# Create the model
generation_config = {
  "temperature": 1,
  "top_p": 0.95,
  "top_k": 40,
  "max_output_tokens": 8192,
  "response_mime_type": "text/plain",
}

model = genai.GenerativeModel(
  model_name="gemini-1.5-pro",
  generation_config=generation_config,
)

# TODO Make these files available on the local file system
# You may need to update the file paths
if os.path.exists('gemini_video/files.pkl'):
    with open('gemini_video/files.pkl', 'rb') as file:
        files = pickle.load(file)
else:
  files = []
  for i in range(1, 119):
      try:
        files.append(upload_to_gemini(f"dataset/startup_videos/video{str(i)}.mp4", mime_type="video/mp4"))
      except:
        print(f"video{str(i)}.mp4 is not found")
        continue

  # Some files have a processing delay. Wait for them to be ready.
  wait_for_files_active(files)
  with open('gemini_video/files.pkl', 'wb') as file_save:
      pickle.dump(files, file_save)

# add assetiveness if want more
# drop future orientation if want less
variables = ['future-orientation','self-confidence','creativity','enthusiasm','optimism','proactive-personality','trustworthiness']
df = pd.DataFrame()
for var in variables:
  print(f"Analyzing {var}...")
  if os.path.exists(f"gemini_video/{var}.pkl"):
    with open(f"gemini_video/{var}.pkl", "rb") as file:
        result = pickle.load(file)
  else:
    result = {'video':[],var:[]}
  if len(result['video']) == len(files):
    continue
  for file in tqdm(files):
    if file.display_name in result['video']:
        continue
    chat_session = model.start_chat()
    response = chat_session.send_message([f"Analyze the {var.replace('-',' ')} of small business owners in entrepreneurship from the videos using the visual information. Provide a rating on a scale of 1 to 10.",file], request_options={"timeout": 1000})
    result['video'].append(file.display_name)
    result[var].append(str(response.parts[0]))
    with open(f"gemini_video/{var}.pkl", "wb") as file:
        pickle.dump(result, file)
    
  df_temp = pd.DataFrame(result)
  df = df.merge(df_temp, on='video', how='outer') if not df.empty else df_temp
  df.to_csv('gemini_video/overall_score.csv')

Analyzing future-orientation...


100%|██████████| 117/117 [44:35<00:00, 22.87s/it]


Analyzing self-confidence...
Analyzing creativity...
Analyzing enthusiasm...
Analyzing optimism...
Analyzing proactive-personality...
Analyzing trustworthiness...


 69%|██████▉   | 81/117 [00:16<00:07,  4.95it/s]


KeyboardInterrupt: 

In [88]:
variables = ['self-confidence','creativity','enthusiasm','optimism','proactive-personality','trustworthiness']
variables = ['trustworthiness']
df = pd.read_csv('gemini/overall_score.csv', index_col=0)
for var in variables:
  print(f"Analyzing {var}...")
  if os.path.exists(f"gemini/{var}.pkl"):
    with open(f"gemini/{var}.pkl", "rb") as file:
        result = pickle.load(file)
  else:
    result = {'video':[],var:[]}
  for file in tqdm(files):
    if file.display_name in result['video']:
        continue
    chat_session = model.start_chat()
    try:
      response = chat_session.send_message([f"Analyze the {var.replace('-',' ')} of small business owners in entrepreneurship from the vidoes using the visual information. Provide a rating on a scale of 1 to 10.",file], request_options={"timeout": 1000})
      result['video'].append(file.display_name)
      result[var].append(str(response.parts[0]))
    except Exception as e:
      print(f"Error: {e} for {file.display_name}")
    with open(f"gemini/{var}.pkl", "wb") as file:
        pickle.dump(result, file)
    
  df_temp = pd.DataFrame(result)
  df = df.merge(df_temp, on='video', how='outer') if not df.empty else df_temp
  df.to_csv('gemini/overall_score.csv')

Analyzing trustworthiness...


 70%|███████   | 82/117 [16:40<07:06, 12.20s/it]

Error: 504 Deadline Exceeded for video83.mp4


100%|██████████| 117/117 [39:35<00:00, 20.30s/it]


In [5]:
# clean the index and rating
import re

def clean_index(index_string):
    return int(index_string.replace('video','').replace('.mp4',''))

def retrieve_rating(rating_string):
    if isinstance(rating_string, str):
        cleaned_text = re.sub(r'\\[0-9]{3}', '', rating_string)
        cleaned_text = re.sub(r'\\n', '', cleaned_text)  
        cleaned_text = re.sub(r'/10', '', cleaned_text)
        cleaned_text = cleaned_text.replace('out of 10','')
        cleaned_text = re.sub(r'\s+', ' ', cleaned_text)
        numbers = re.findall(r'\b(?:10(?:\.0+)?|[1-9](?:\.\d+)?)\b', cleaned_text)
        if not numbers:
            return None
        elif len(numbers) == 1:
            return float(numbers[0])
        else:
            return numbers[-1]
    else:
        return rating_string
df = pd.read_csv('gemini_video/overall_score.csv', index_col=0)
df['video'] = df['video'].apply(clean_index)
for var in df.columns[1:]:
    
    df[var] = df[var].apply(retrieve_rating)
df.sort_values('video', inplace=True)
df.to_csv('gemini_video/overall_score_cleaned.csv', index=False)
df

Unnamed: 0,video,future-orientation
0,1,10.0
1,2,9.5
2,3,9.0
3,4,10
4,5,9
...,...,...
112,114,9.5
113,115,9.5
114,116,9.5
115,117,9.0
