In [None]:
import openai
import requests
import copyleaks
import numpy as np
import os
import json
import plotly.express as px
import pandas as pd
# Show up to 400 characters per cell when a DataFrame is displayed.
pd.set_option('display.max_colwidth', 400)

# Detector back-ends used by the scoring functions defined in this notebook.
from DetectGPT.model import GPT2PPLV2 as GPT2PPL
from GLTR.api import LM as GLTR
from transformers import OpenAIGPTLMHeadModel, OpenAIGPTTokenizer, pipeline
from transformers import logging
# Only let transformers log messages at error level.
logging.set_verbosity_error()

import warnings
# Suppress every Python warning for the rest of the session.
warnings.filterwarnings("ignore")

from tqdm.notebook import tqdm
# Register tqdm's progress_* methods on pandas objects.
tqdm.pandas()

from IPython.core.display import display, HTML
# Inject CSS that sets elements with the `container` class to 95% width.
display(HTML("<style>.container { width:95% !important; }</style>"))

In [None]:
# Label convention: 1 indicates AI-produced text, 0 indicates human-produced text

In [None]:
def readJSON(location):
    """
    Load a UTF-8 encoded JSON file into a pandas DataFrame.

    ::param location: (str) path of the JSON file to read

    ::return: (pd.DataFrame) frame built from the parsed JSON content
    """
    with open(location, encoding="utf-8") as handle:
        parsed = json.loads(handle.read())
    return pd.DataFrame(parsed)

In [None]:
def getRobertaScore(text):
    """
    Score a text with the RoBERTa base OpenAI detector.
    https://huggingface.co/roberta-base-openai-detector

    ::param text: (str) text to classify

    ::return: (dict) one "roberta_score_<label>" entry per label returned by
              the classifier, plus "roberta_Label" which is 1 when the
              highest-scoring label is "Fake" (AI produced) and 0 otherwise
    """
    # Building the pipeline loads the model weights, so build it once and keep
    # it on the function object instead of reloading it for every call.
    model = getattr(getRobertaScore, "_model", None)
    if model is None:
        model = pipeline("text-classification", model="roberta-base-openai-detector")
        getRobertaScore._model = model

    vals = model(text, top_k=3)

    # The label with the highest score decides the 0/1 flag.
    flag = sorted(vals, key = lambda x: x["score"])[-1]["label"]
    flag = 1 if flag == "Fake" else 0

    return {**{f"roberta_score_{j['label']}": j["score"] for j in vals}, **{"roberta_Label": flag}}
        

In [None]:
def getRobertaLargeScore(text):
    """
    Score a text with the RoBERTa large OpenAI detector.
    https://huggingface.co/roberta-large-openai-detector

    ::param text: (str) text to classify

    ::return: (dict) "roberta_large_score_1" (score of LABEL_0) and
              "roberta_large_score_0" (score of LABEL_1), plus
              "roberta_large_Label" which is 1 when LABEL_0 has the highest
              score and 0 otherwise
    """
    # Building the pipeline loads the model weights, so build it once and keep
    # it on the function object instead of reloading it for every call.
    model = getattr(getRobertaLargeScore, "_model", None)
    if model is None:
        model = pipeline("text-classification", model="roberta-large-openai-detector")
        getRobertaLargeScore._model = model

    vals = model(text, top_k=3)

    # The label with the highest score decides the 0/1 flag.
    flag = sorted(vals, key = lambda x: x["score"])[-1]["label"]
    flag = 1 if flag == "LABEL_0" else 0

    # Rename the raw labels to this notebook's convention (1 = AI, 0 = human).
    d = {"LABEL_1": "0", "LABEL_0": "1"}

    return {**{f"roberta_large_score_{d[j['label']]}": j["score"] for j in vals}, **{"roberta_large_Label": flag}}


In [None]:
def getDetectGPTScore(text):
    """
    Score a text with the DetectGPT-style detector ("v1.1" mode of GPT2PPL).
    https://www.arxiv-vanity.com/papers/2301.11305/
    Note: this detector is slow.

    ::param text: (str) text to score

    ::return: (dict) "gpt_detect_mean_score", "gpt_detect_mean_probability"
              and "gpt_detect_Label" (1 minus the first element of the result)
    """
    detector = GPT2PPL()
    # The character length of the text is passed as the second argument.
    # NOTE(review): confirm this is the chunk size GPT2PPL expects.
    result = detector(text, len(text), "v1.1")

    # The raw label is flipped so it follows this notebook's 1 = AI convention
    # (presumably the detector reports the opposite - verify against GPT2PPL).
    label = 1 - result[0]

    scores = {}
    scores["gpt_detect_mean_score"] = result[2]
    scores["gpt_detect_mean_probability"] = result[3]
    scores["gpt_detect_Label"] = label
    return scores

In [None]:
def getGPTZeroScore(text):
    """
    Score a text with the GPTZero-style detector ("v1" mode of GPT2PPL).
    https://gptzero.me/

    ::param text: (str) text to score

    ::return: (dict) "gpt_zero_Perplexity", "gpt_zero_Burtiness" (key spelling
              kept as-is because downstream columns use it) and
              "gpt_zero_Label" (1 minus the detector's own label)
    """
    detector = GPT2PPL()
    # Only the first element of the result (a dict of metrics) is used.
    summary = detector(text, None, "v1")[0]

    # The raw label is flipped so it follows this notebook's 1 = AI convention
    # (presumably the detector reports the opposite - verify against GPT2PPL).
    label = 1 - summary["label"]

    scores = {}
    scores["gpt_zero_Perplexity"] = summary["Perplexity"]
    scores["gpt_zero_Burtiness"] = summary["Burstiness"]
    scores["gpt_zero_Label"] = label
    return scores

In [None]:
def getGLTRScore(text, threshold = 0.7):
    """
    Score a text with GLTR by bucketing the first value of every "real_topk"
    entry and reporting the share of entries in each bucket.
    http://gltr.io/

    Buckets: 0 for values below 10, 1 below 100, 2 below 1000, 3 otherwise.

    ::param text: (str) text to score
    ::param threshold: (float) the text is flagged (label 1) when the share of
                       bucket 0 is strictly greater than this value

    ::return: (dict) "gltr_0" .. "gltr_3" bucket shares and "gltr_Label"
    """
    def bucket(rank):
        if rank < 10:
            return 0
        if rank < 100:
            return 1
        if rank < 1000:
            return 2
        return 3

    lm = GLTR()
    analysis = lm.check_probabilities(text)

    # Bucket every entry once and reuse the array for all four shares.
    buckets = np.array([bucket(entry[0]) for entry in analysis["real_topk"]])
    total = len(buckets)
    share = {level: sum(buckets == level)/total for level in (0, 1, 2, 3)}

    if share[0] > threshold:
        label = 1
    else:
        label = 0

    return {"gltr_0": share[0],
            "gltr_1": share[1],
            "gltr_2": share[2],
            "gltr_3": share[3],
            "gltr_Label": label}

In [None]:
# %%time
# roberta_dict = getRobertaScore(textSub)
# roberta_large_dict = getRobertaLargeScore(textSub)
# gpt_zero_dict = getGPTZeroScore(textSub)
# gpt_detect_dict = getDetectGPTScore(textSub)
# gltr_dict = getGLTRScore(textSub)

# {**roberta_dict, **gpt_zero_dict, **gpt_detect_dict, **roberta_large_dict, **gltr_dict}

In [None]:
def isAIGenerated(text, chunksize = 300, step_size = 300):
    """
    Score a text with several AI-text detectors, one result row per chunk.

    The text is split on whitespace and scored in windows of `chunksize`
    words that start every `step_size` words. Each window is scored by
        * GPTZero - https://gptzero.me/
        * Roberta - https://huggingface.co/roberta-base-openai-detector
        * Roberta Large - https://huggingface.co/roberta-large-openai-detector
        * GLTR - http://gltr.io/dist/index.html
    (the DetectGPT scorer is currently disabled).

    The number of windows is max(1, word_count // step_size), so words after
    the last full step are not scored.

    ::param text: (str) text to analyse
    ::param chunksize: (int) number of words per scored window
    ::param step_size: (int) number of words between window starts

    ::return: (pd.DataFrame) one row per window with every detector's scores
              and a "Text" column holding the window; an empty DataFrame when
              the text has no words or any step fails (the error is printed)
    """
    # Defined before the try block so the error handler can always report it.
    textSub = ""
    try:
        textSplit = text.split()
        if not textSplit:
            return pd.DataFrame()

        records = []
        # Builtin max on purpose: np.max(1, n) would interpret n as an axis.
        for i in range(max(1, len(textSplit)//step_size)):
            start = step_size*i
            textSub = " ".join(textSplit[start:start + chunksize])

            roberta_dict = getRobertaScore(textSub)
            roberta_large_dict = getRobertaLargeScore(textSub)
            gpt_zero_dict = getGPTZeroScore(textSub)
            gpt_detect_dict = {}  # getDetectGPTScore(textSub) is disabled
            gltr_dict = getGLTRScore(textSub)

            # Collect plain dicts and build the frame once at the end;
            # DataFrame.append no longer exists in pandas 2.x.
            records.append({**roberta_dict, **gpt_zero_dict, **gpt_detect_dict,
                            **roberta_large_dict, **gltr_dict, "Text": textSub})
        return pd.DataFrame(records)
    except Exception as e:
        # Best effort: report the failure and the length of the chunk being
        # processed, then hand back an empty frame so callers can skip it.
        print(e)
        print(len(textSub))
        return pd.DataFrame()

In [None]:
# Score every transcript found in TRANSCRIPT_DIR and write two outputs:
#   * MetricsOfEarningCallsV2.csv - describe() summary per transcript
#   * MetaOfEarningCallsV2.xlsx   - raw per-chunk detector scores
TRANSCRIPT_DIR = "TranscriptsHistory"

allData = pd.DataFrame()
metaData = pd.DataFrame()

# Frames are collected in lists and concatenated once per file; concatenating
# inside the row loop re-copies everything gathered so far on every row.
metricFrames = []
chunkFrames = []

# Sorted so that the output row order does not depend on the file system.
for LOC in sorted(os.listdir(TRANSCRIPT_DIR)):
    print("Starting ", LOC)
    data = readJSON(os.path.join(TRANSCRIPT_DIR, LOC))

    for row in tqdm(data.values):
        # Fields are read by position.
        # NOTE(review): confirm these positions against the JSON schema.
        symbol = row[0]
        year = row[2]
        quarter = row[3]
        conversation = row[-1]

        values = isAIGenerated(conversation)
        if values.empty:
            # isAIGenerated returns an empty frame when scoring failed;
            # describe() raises on a frame without columns, so skip the row.
            continue

        metrics = values.describe().reset_index()

        metrics["symbol"] = symbol
        metrics["year"] = year
        metrics["quarter"] = quarter

        metricFrames.append(metrics)
        chunkFrames.append(values)

    # Checkpoint after every file so a later failure keeps earlier results.
    if metricFrames:
        allData = pd.concat(metricFrames)
        metaData = pd.concat(chunkFrames)

    allData.to_csv("MetricsOfEarningCallsV2.csv")
    metaData.to_excel("MetaOfEarningCallsV2.xlsx")