In [1]:
import json
import nltk
from nltk.tokenize import word_tokenize
import numpy as np
import pandas as pd
import re

In [2]:
# download punkt only once
# nltk.download('punkt')

In [3]:
# file containing the conversation data
file_path= 'conv_sample'

In [4]:
# initialize lists to store data for DataFrame creation.
post_lists= []
num_posts_list= []

# open the file in read mode ('r') using a 'with' statement.
with open(file_path, 'r') as file:
    
    # iterate through each line in the file.
    for line in file:
        # parse the JSON string into a Python dictionary.
        data= json.loads(line)
        
        # extract the "post_list" field and iterate through its items.
        post_list= data.get("post_list", [])
        
        # collect "text" fields and their counts for each "post_list".
        text_fields= [post.get("text", "") for post in post_list]
        num_posts= len(text_fields)
        
        # append data to the lists.
        post_lists.append(text_fields)
        num_posts_list.append(num_posts)

# create a Pandas DataFrame.
df= pd.DataFrame({'post_list': post_lists, 'num_posts': num_posts_list})

# print the DataFrame.
print(df)

                                               post_list  num_posts
0      [Update dependency @testing-library/user-event...          1
1      [I would like to work on this issue, go for it...          5
2      [Expand support for indirections. 1) Implement...          4
3      ["Can't Rename, no element found" on anything ...         20
4      [<!-- CLA-CHECK:130 -->\n&#10060; Author of th...          1
...                                                  ...        ...
19995  [Bump mongoose from 5.9.25 to 5.13.15. Bumps [...          1
19996  [Bump puppeteer from 19.9.0 to 19.10.0. Bumps ...          1
19997  [3Dモデル用シェーダー作成. 3Dモデルで使用するシェーダーを作成する。\r\n\r\n要...          1
19998  [### <span aria-hidden="true">👷</span> Deploy ...          1
19999                               [Feature/with front]          1

[20000 rows x 2 columns]


In [5]:
# calculate the mean of the 'num_posts' column
mean_num_posts= df['num_posts'].mean()

# print the mean
print(f"Mean of num_posts (Mean number of texts per conversation): {mean_num_posts}")

Mean of num_posts (Mean number of texts per conversation): 1.49645


In [6]:
# see the first 100 conversations
df["post_list"][:100]

0     [Update dependency @testing-library/user-event...
1     [I would like to work on this issue, go for it...
2     [Expand support for indirections. 1) Implement...
3     ["Can't Rename, no element found" on anything ...
4     [<!-- CLA-CHECK:130 -->\n&#10060; Author of th...
                            ...                        
95                                 [Superseded by #20.]
96    [Bump rails-html-sanitizer from 1.2.0 to 1.4.4...
97    [Bump rails-html-sanitizer from 1.2.0 to 1.4.4...
98                                 [Superseded by #23.]
99    [Bump express from 4.17.2 to 4.17.3 in /server...
Name: post_list, Length: 100, dtype: object

In [7]:
df= pd.DataFrame(df)

# Create a new column 'conversation_id' with unique IDs for each conversation
df['conversation_id'] = range(len(df))

# Explode the 'post_list' column into separate rows
df = df.explode('post_list', ignore_index=True)

# Rearrange the columns to have 'conversation_id' come first
df = df[['conversation_id', 'post_list']]

# Rename the 'post_list' column to 'post'
df = df.rename(columns={'post_list': 'post'})

df['post_working_language'] = np.nan
df['post_embedded_language'] = np.nan

# Display the final DataFrame
print(df)

# Specify the path where you want to save the CSV file
file_path = 'conv_sample_posts.csv'

# Save the DataFrame to a CSV file
df.to_csv(file_path, index=False)  # Set index=False to exclude the index column

print(f'DataFrame has been saved to {file_path}')

       conversation_id                                               post  \
0                    0  Update dependency @testing-library/user-event ...   
1                    1                 I would like to work on this issue   
2                    1                                         go for it!   
3                    1  Looks like there is a potential PoC and Docker...   
4                    1  After looking into this further I think it wil...   
...                ...                                                ...   
29924            19995  Bump mongoose from 5.9.25 to 5.13.15. Bumps [m...   
29925            19996  Bump puppeteer from 19.9.0 to 19.10.0. Bumps [...   
29926            19997  3Dモデル用シェーダー作成. 3Dモデルで使用するシェーダーを作成する。\r\n\r\n要件...   
29927            19998  ### <span aria-hidden="true">👷</span> Deploy P...   
29928            19999                                 Feature/with front   

       post_working_language  post_embedded_language  
0                   

In [8]:
# API key: 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

In [9]:
import openai

In [10]:
df= pd.read_csv("conv_sample_posts.csv")

In [11]:
# Dr. Lu's API key
openai.api_key= "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"

In [12]:
def get_working_language_prompt(post):
    """
    Input: text string
    Output: prompt containing the text string requesting classification
    """
    # embed a (possibly truncated) post from a conversation inside a prompt that will be used to query the ChatGPT API
    prompt= f'What natural language(s) are present in this post: "{post}". Please reply with the language(s) separated by commas and no additional commentary or punctuation.'
    return prompt

In [13]:
def get_classification(post):
    """
    This function performs the ChatGPT API query
    Input: text string
    Output: classification as a string, with commas separating languages
    """
    # get the response, which is a OpenAIObject containing a JSON string
    response = openai.ChatCompletion.create(
      model="gpt-3.5-turbo", 
      messages=[{"role": "user", "content": get_working_language_prompt(post)}]
    )
    
    # get the 'content' value from the object, which contains the language classification
    classification = response.choices[0].message.content

    # return the classification, which is a string that may contain more than one language, separated by commas
    return classification

In [14]:
def get_conversation_length(conversation_id):
    """
    Get the total number of posts given a conversation_id
    """
    # Filter the DataFrame to include only rows with the given conversation_id
    filtered_df = df[df['conversation_id'] == conversation_id]
    
    # Get the total number of posts for the conversation_id
    total_posts = len(filtered_df)
    
    return total_posts

In [15]:
def truncate_post(post, n):
    """
    Truncate a string to a maximum number of tokens (n)
    """
    # Tokenize the input string into words
    words = word_tokenize(post)
    
    # Select the first n tokens
    truncated_tokens = words[:n]
    
    # Join the selected tokens back into a string
    truncated_post = ' '.join(truncated_tokens)
    
    return truncated_post

In [16]:
# iterate through a range of conversation data points
for i in range(10):
    
    conversation_id= df.at[i, 'conversation_id']
    print(f"• Examining data point {i}, with conversation_id {conversation_id}")
    
    if pd.isna(df['post_working_language'].iloc[i]):

        # see how long the conversation is
        conversation_length= get_conversation_length(conversation_id)
        print(f"----Data point {i} is part of a conversation of length {conversation_length}")

        # initialize a set to store the languages associated with the post
        languages= set()

        # get the post content
        post= df.at[i, 'post']
        # truncate the post
        truncated_post= truncate_post(post, 25)

        # print out the truncated post
        print(f"--------Post content: {truncated_post}")

        # get the language classification for the text
        print(f"--------Getting the language classification...")
        classification= get_classification(truncated_post) # <- HERE IS WHERE THE API QUERY TAKES PLACE
        # classification= "English"
        print(f"--------Classification is: {classification}")

        # split the languages into a list, since 'classification' may be a string with languages separated by commas
        classification= classification.split(",")
        # add each language to the set of languages associated with the conversation
        for language in classification:
            languages.add(language)

        # print out the set of languages associated with the conversation data point
        print(f"--------The language(s) used in data point {i} are: {languages}")

        # associate the languages with the conversation data point in a new column
        df.loc[i, "post_working_language"]= ", ".join(map(str, languages))

        print("\n")
    
    else:
        print(f"----Data point {i} already has a working language classification!")
        
# Specify the path where you want to save the CSV file
file_path = 'conv_sample_posts.csv'

# Save the DataFrame to a CSV file
df.to_csv(file_path, index=False)  # Set index=False to exclude the index column

print(f'DataFrame has been saved to {file_path}')

• Examining data point 0, with conversation_id 0
----Data point 0 is part of a conversation of length 1
--------Post content: Update dependency @ testing-library/user-event to v14 . [ ! [ Mend Renovate ] ( https : //app.renovatebot.com/images/banner.svg ) ] ( https : //renovatebot.com ) This
--------Getting the language classification...
--------Classification is: English
--------The language(s) used in data point 0 are: {'English'}


• Examining data point 1, with conversation_id 1
----Data point 1 is part of a conversation of length 5
--------Post content: I would like to work on this issue
--------Getting the language classification...
--------Classification is: English
--------The language(s) used in data point 1 are: {'English'}


• Examining data point 2, with conversation_id 1
----Data point 2 is part of a conversation of length 5
--------Post content: go for it !
--------Getting the language classification...
--------Classification is: English
--------The language(s) used in da

In [17]:
# Set the max_rows option temporarily to display all 100 rows
with pd.option_context('display.max_rows', None):
    display(df.head(100))

Unnamed: 0,conversation_id,post,post_working_language,post_embedded_language
0,0,Update dependency @testing-library/user-event ...,English,
1,1,I would like to work on this issue,English,
2,1,go for it!,English,
3,1,Looks like there is a potential PoC and Docker...,English,
4,1,After looking into this further I think it wil...,English,
5,1,I spoke with @catc0n about this and she had ag...,English.,
6,2,Expand support for indirections. 1) Implement ...,English.,
7,2,"can you explain why we dont need this anymore,...",English,
8,2,The flag is only relevant for stores i. e. `ST...,English,
9,2,"oh yes, thanks",English,


TODO:

- Add embedded language prompt