In [1]:
import openai
import requests
import copyleaks
import numpy as np
import os
import json
import plotly.express as px
import pandas as pd
# Widen text columns so long transcript excerpts are not cut off when displayed.
pd.set_option('display.max_colwidth', 400)

# Detector back-ends used by the scoring functions below.
from DetectGPT.model import GPT2PPLV2 as GPT2PPL
from GLTR.api import LM as GLTR
from transformers import OpenAIGPTLMHeadModel, OpenAIGPTTokenizer, pipeline
from transformers import logging
# Only surface errors from transformers.
logging.set_verbosity_error()

import warnings
# NOTE(review): this silences every warning in the kernel, including
# deprecation notices from pandas/numpy -- narrow the filter if that is not intended.
warnings.filterwarnings("ignore")

from tqdm.notebook import tqdm
# Registers DataFrame.progress_apply and friends.
tqdm.pandas()

# `display` and `HTML` are public in IPython.display; importing them from
# IPython.core.display is deprecated.
from IPython.display import display, HTML
# Let the notebook use 95% of the browser width.
display(HTML("<style>.container { width:95% !important; }</style>"))

Found API <class 'GLTR.api.LM'> with name gpt-2-small


In [2]:
# 1 indicates AI produced, 0 indicates human produced

In [3]:
def readJSON(location):
    """
    Read the UTF-8 encoded JSON file at `location` and wrap its contents
    in a pandas DataFrame.

    ::param location: (str) path of the JSON file

    ::return: (pd.DataFrame) frame built from the parsed JSON
    """
    with open(location, encoding="utf-8") as handle:
        parsed = json.loads(handle.read())
    frame = pd.DataFrame(parsed)
    return frame

In [4]:
def getRobertaScore(text):
    """
    Score `text` with the RoBERTa-base OpenAI detector.
    https://huggingface.co/roberta-base-openai-detector

    ::param text: (str) passage to classify

    ::return: (dict) one "roberta_score_<label>" entry per label returned by
        the classifier, plus "roberta_Label", which is 1 when the
        highest-scoring label is "Fake" (AI produced) and 0 otherwise
    """
    # Constructing the pipeline is expensive and this function runs once per
    # text chunk, so build it on first use and keep it on the function object.
    model = getattr(getRobertaScore, "_model", None)
    if model is None:
        model = pipeline("text-classification", model="roberta-base-openai-detector")
        getRobertaScore._model = model

    vals = model(text, top_k=3)

    # Label of the highest-scoring class.
    top_label = sorted(vals, key = lambda x: x["score"])[-1]["label"]
    flag = 1 if top_label == "Fake" else 0

    scores = {f"roberta_score_{j['label']}": j["score"] for j in vals}
    return {**scores, "roberta_Label": flag}
        

In [5]:
def getRobertaLargeScore(text):
    """
    Score `text` with the RoBERTa-large OpenAI detector.
    https://huggingface.co/roberta-large-openai-detector

    ::param text: (str) passage to classify

    ::return: (dict) "roberta_large_score_1" / "roberta_large_score_0" holding
        the scores reported for "LABEL_0" / "LABEL_1" respectively, plus
        "roberta_large_Label", which is 1 when "LABEL_0" is the
        highest-scoring label and 0 otherwise
    """
    # Constructing the pipeline is expensive and this function runs once per
    # text chunk, so build it on first use and keep it on the function object.
    model = getattr(getRobertaLargeScore, "_model", None)
    if model is None:
        model = pipeline("text-classification", model="roberta-large-openai-detector")
        getRobertaLargeScore._model = model

    vals = model(text, top_k=3)

    # Label of the highest-scoring class.
    top_label = sorted(vals, key = lambda x: x["score"])[-1]["label"]
    flag = 1 if top_label == "LABEL_0" else 0

    # Map the model's generic label names onto this notebook's convention
    # (1 = AI produced, 0 = human produced); values are only used as key suffixes.
    suffix = {"LABEL_1": "0", "LABEL_0": "1"}

    scores = {f"roberta_large_score_{suffix[j['label']]}": j["score"] for j in vals}
    return {**scores, "roberta_large_Label": flag}


In [6]:
def getDetectGPTScore(text):
    """
    Run the DetectGPT model (version "v1.1") over `text`.
    https://www.arxiv-vanity.com/papers/2301.11305/
    Slow to run.

    ::param text: (str) passage to score; its character length is passed to
        the model as the second argument

    ::return: (dict) "gpt_detect_mean_score" (element 2 of the model output),
        "gpt_detect_mean_probability" (element 3) and "gpt_detect_Label"
        (1 minus element 0)
    """
    detector = GPT2PPL()
    output = detector(text, len(text), "v1.1")

    # Invert the model's own label -- presumably it reports 0 for AI text while
    # this notebook uses 1 = AI produced; confirm against DetectGPT.model.
    label = 1 - output[0]

    result = {}
    result["gpt_detect_mean_score"] = output[2]
    result["gpt_detect_mean_probability"] = output[3]
    result["gpt_detect_Label"] = label
    return result

In [7]:
def getGPTZeroScore(text):
    """
    Compute GPTZero-style statistics for `text` using the "v1" mode of the
    GPT2PPL model.
    https://gptzero.me/

    ::param text: (str) passage to score

    ::return: (dict) "gpt_zero_Perplexity", "gpt_zero_Burtiness" and
        "gpt_zero_Label" (1 minus the "label" field of the model output)
    """
    scorer = GPT2PPL()
    # Only the first element of the output is used; it is read as a dict.
    summary = scorer(text, None, "v1")[0]

    # Invert the model's own label -- presumably it reports 0 for AI text while
    # this notebook uses 1 = AI produced; confirm against DetectGPT.model.
    label = 1 - summary["label"]

    return dict([
        ("gpt_zero_Perplexity", summary["Perplexity"]),
        ("gpt_zero_Burtiness", summary["Burstiness"]),
        ("gpt_zero_Label", label),
    ])

In [8]:
def getGLTRScore(text, threshold = 0.7):
    """
    Summarise the GLTR analysis of `text` as the share of tokens per bucket.
    http://gltr.io/

    The first element of every "real_topk" entry is bucketed as
    0 (< 10), 1 (< 100), 2 (< 1000) or 3 (everything else).

    ::param text: (str) passage to analyse
    ::param threshold: (float) the text is labelled 1 when the share of
        bucket-0 tokens is strictly greater than this value

    ::return: (dict) "gltr_0".."gltr_3" bucket shares and "gltr_Label"
    """
    analyser = GLTR()

    def bucket(rank):
        # Same boundaries as the nested np.where cascade: 10 / 100 / 1000.
        if rank < 10:
            return 0
        if rank < 100:
            return 1
        if rank < 1000:
            return 2
        return 3

    analysis = analyser.check_probabilities(text)
    levels = [bucket(entry[0]) for entry in analysis["real_topk"]]

    def share(level):
        # Fraction of tokens that landed in `level`.
        return sum(np.array(levels) == level)/len(levels)

    label = 0
    if share(0) > threshold:
        label = 1

    summary = {f"gltr_{level}": share(level) for level in (0, 1, 2, 3)}
    summary["gltr_Label"] = label
    return summary

In [9]:
# %%time
# roberta_dict = getRobertaScore(textSub)
# roberta_large_dict = getRobertaLargeScore(textSub)
# gpt_zero_dict = getGPTZeroScore(textSub)
# gpt_detect_dict = getDetectGPTScore(textSub)
# gltr_dict = getGLTRScore(textSub)

# {**roberta_dict, **gpt_zero_dict, **gpt_detect_dict, **roberta_large_dict, **gltr_dict}

In [10]:
def isAIGenerated(text, chunksize = 300, step_size = 300):
    """
    Score a text for signs of being AI generated.

    The text is split on whitespace and scored in windows of `chunksize`
    words, starting every `step_size` words. Each window is scored by
        * GPTZero - https://gptzero.me/
        * Roberta - https://huggingface.co/roberta-base-openai-detector
        * Roberta Large - https://huggingface.co/roberta-large-openai-detector
        * GLTR - http://gltr.io/dist/index.html
    (DetectGPT is currently disabled.)

    The number of windows is max(1, n_words // step_size): a text shorter
    than `step_size` is scored as a single window, and words beyond the last
    window of a longer text are not scored.

    ::param text: (str)
    ::param chunksize: (int) words per window
    ::param step_size: (int) words between the starts of consecutive windows

    ::return: (pd.DataFrame) one row per window holding every detector's
        scores plus the window itself in a "Text" column; an empty DataFrame
        if anything fails (the error is printed, not raised)
    """
    # Last window that was built; kept outside the try so the error handler
    # can report on it even when the failure happens before the first window.
    textSub = None
    try:
        textSplit = text.split()

        # Builtin max: np.max(1, n) would treat n as the `axis` argument.
        n_chunks = max(1, len(textSplit)//step_size)

        # Collect plain dicts and build the frame once; DataFrame.append was
        # removed in pandas 2.0 and re-copied the frame on every call.
        records = []
        for i in range(n_chunks):
            start = step_size*i
            textSub = " ".join(textSplit[start:start + chunksize])

            roberta_dict = getRobertaScore(textSub)
            roberta_large_dict = getRobertaLargeScore(textSub)
            gpt_zero_dict = getGPTZeroScore(textSub)
            gpt_detect_dict = {}#getDetectGPTScore(textSub)
            gltr_dict = getGLTRScore(textSub)

            records.append({**roberta_dict, **gpt_zero_dict, **gpt_detect_dict,
                            **roberta_large_dict, **gltr_dict, "Text": textSub})
        return pd.DataFrame(records)
    except Exception as e:
        # Best effort: report the problem and let the caller carry on.
        print(e)
        if textSub is not None:
            print(len(textSub))
        return pd.DataFrame()

In [11]:
# import time

# textSub = """
# In the year 2050, artificial intelligence has transformed every aspect of human life. From self-driving cars to intelligent personal assistants, AI has become an indispensable part of our daily routine. People now live in smart homes where AI-powered systems control the temperature, lighting, and security with perfect precision.
# In the field of medicine, AI has revolutionized healthcare. Advanced algorithms analyze vast amounts of medical data to diagnose diseases at an early stage and recommend personalized treatments. Surgeries are performed with the assistance of surgical robots, ensuring unparalleled precision and minimizing human error.
# Education has also undergone a significant transformation. AI tutors provide personalized learning experiences, adapting to each student's unique needs and learning style. Virtual reality simulations create immersive environments for students to explore various subjects, making education more engaging and interactive than ever before.
# AI has even extended its reach to the creative realm. AI-generated artwork, music, and literature have gained recognition and appreciation among audiences worldwide. Machines have become proficient in composing symphonies, painting masterpieces, and crafting compelling stories that evoke emotions and captivate the imagination.
# While AI has brought numerous benefits, it has also raised ethical concerns. The increasing reliance on AI has led to discussions about job displacement and the potential loss of human touch in various industries. Striking the right balance between automation and human involvement remains a critical challenge for society.
# As AI continues to advance, the possibilities seem limitless. The future holds promises of further breakthroughs in areas such as quantum computing, deep learning, and neural interfaces. It is an exciting time to witness the ever-evolving landscape of artificial intelligence and its impact on shaping our future.
# """

# print("RoBERTa")
# st = time.time()
# roberta_dict = getRobertaScore(textSub)
# print(f"Took {round(time.time() - st, 2)} seconds")
# print("RoBERTa Large")
# st = time.time()
# roberta_large_dict = getRobertaLargeScore(textSub)
# print(f"Took {round(time.time() - st, 2)} seconds")
# print("GPTZero")
# st = time.time()
# gpt_zero_dict = getGPTZeroScore(textSub)
# print(f"Took {round(time.time() - st, 2)} seconds")
# print("DetectGPT")
# st = time.time()
# gpt_detect_dict = getDetectGPTScore(textSub)
# print(f"Took {round(time.time() - st, 2)} seconds")
# print("GLTR")
# st = time.time()
# gltr_dict = getGLTRScore(textSub)
# print(f"Took {round(time.time() - st, 2)} seconds")

# {**roberta_dict, **roberta_large_dict, **gpt_zero_dict, **gpt_detect_dict, **gltr_dict}

In [12]:
# Score every earnings-call transcript in TranscriptsHistory2008.
#   allData  - describe() statistics of the chunk scores per call, tagged with
#              symbol / year / quarter
#   metaData - the raw per-chunk scores of every call
allData = pd.DataFrame()
metaData = pd.DataFrame()

# Accumulate the pieces in lists and concatenate once per file; concatenating
# onto the growing frames inside the row loop re-copies them every iteration.
metricFrames = []
valueFrames = []

# sorted() makes the processing (and therefore output) order reproducible.
for LOC in sorted(os.listdir("TranscriptsHistory2008")):
    print("Starting ", LOC)
    data = readJSON(os.path.join("TranscriptsHistory2008", LOC))

    for row in tqdm(data.values):
        # Positional access: assumes the columns are laid out as
        # symbol, <unused>, year, quarter, ..., conversation -- TODO confirm.
        symbol = row[0]
        year = row[2]
        quarter = row[3]
        conversation = row[-1]

        values = isAIGenerated(conversation)
        if values.empty:
            # isAIGenerated returns an empty frame when scoring failed;
            # describe() raises on a frame without columns, so skip the call.
            continue
        metrics = values.describe().reset_index()

        metrics["symbol"] = symbol
        metrics["year"] = year
        metrics["quarter"] = quarter

        metricFrames.append(metrics)
        valueFrames.append(values)

    # Checkpoint after each file so a crash does not lose finished work.
    if metricFrames:
        allData = pd.concat(metricFrames)
        metaData = pd.concat(valueFrames)
    allData.to_csv("MetricsOfEarningCallsV2.csv")
    metaData.to_excel("MetaOfEarningCallsV2.xlsx")

Starting  AAPL.json


  0%|          | 0/60 [00:00<?, ?it/s]

> [1;32m<ipython-input-10-2836636eb020>[0m(21)[0;36misAIGenerated[1;34m()[0m
[1;32m     19 [1;33m        [0mtextSplit[0m [1;33m=[0m [0mtext[0m[1;33m.[0m[0msplit[0m[1;33m([0m[1;33m)[0m[1;33m[0m[1;33m[0m[0m
[0m[1;32m     20 [1;33m[1;33m[0m[0m
[0m[1;32m---> 21 [1;33m        [1;32mfor[0m [0mi[0m [1;32min[0m [0mrange[0m[1;33m([0m[0mlen[0m[1;33m([0m[0mtextSplit[0m[1;33m)[0m[1;33m//[0m[0mstep_size[0m[1;33m)[0m[1;33m:[0m[1;33m[0m[1;33m[0m[0m
[0m[1;32m     22 [1;33m            [0mtextSub[0m [1;33m=[0m [1;34m" "[0m[1;33m.[0m[0mjoin[0m[1;33m([0m[0mtextSplit[0m[1;33m[[0m[0mstep_size[0m[1;33m*[0m[0mi[0m[1;33m:[0m[1;33m([0m[0mstep_size[0m[1;33m*[0m[0mi[0m[1;33m)[0m [1;33m+[0m [0mchunksize[0m[1;33m][0m[1;33m)[0m[1;33m[0m[1;33m[0m[0m
[0m[1;32m     23 [1;33m[1;33m[0m[0m
[0m
ipdb> textSub
"Thank you. Good afternoon and thanks for joining us. Speaking today is Apple CFO Peter Oppenhe

ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.



Traceback (most recent call last):
  File "C:\Users\daire\anaconda3\lib\site-packages\IPython\core\interactiveshell.py", line 3418, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-12-52a496b39586>", line 14, in <module>
    values = isAIGenerated(conversation)
  File "<ipython-input-10-2836636eb020>", line 21, in isAIGenerated
    for i in range(len(textSplit)//step_size):
  File "<ipython-input-4-0695403b57d7>", line 5, in getRobertaScore
    model = pipeline("text-classification", model="roberta-base-openai-detector")
  File "C:\Users\daire\anaconda3\lib\site-packages\transformers\pipelines\__init__.py", line 705, in pipeline
    config = AutoConfig.from_pretrained(model, _from_pipeline=task, **hub_kwargs, **model_kwargs)
  File "C:\Users\daire\anaconda3\lib\site-packages\transformers\models\auto\configuration_auto.py", line 947, in from_pretrained
    config_dict, unused_kwargs = PretrainedConfig.get_config_dict(pretrained_model_name_or_path,

TypeError: object of type 'NoneType' has no len()