In [1]:
import json
import numpy as np
import pandas as pd

In [2]:
# Path (relative to the working directory) of the conversation file that the
# loading cell opens and parses one line at a time.
file_path = "conv_sample"

In [3]:
# Accumulators for the DataFrame: one entry per conversation (= per input line).
post_lists = []
num_posts_list = []

# The input is parsed as one JSON object per line. The encoding is given
# explicitly because the posts contain non-ASCII text (CJK, emoji); relying on
# the platform default breaks on systems whose locale encoding is not UTF-8.
with open(file_path, 'r', encoding='utf-8') as file:
    for line in file:
        # Skip blank lines (e.g. a trailing empty line at end of file);
        # json.loads would raise on them.
        if not line.strip():
            continue

        # Parse the JSON string into a Python dictionary.
        data = json.loads(line)

        # `or []` covers both a missing "post_list" key and an explicit null,
        # so the comprehension below always receives an iterable.
        post_list = data.get("post_list") or []

        # Collect the "text" of every post ("" when a post has no text) and
        # count how many posts this conversation has.
        text_fields = [post.get("text", "") for post in post_list]
        num_posts = len(text_fields)

        post_lists.append(text_fields)
        num_posts_list.append(num_posts)

# One row per conversation: the list of post texts and its length.
df = pd.DataFrame({'post_list': post_lists, 'num_posts': num_posts_list})

# Print the DataFrame.
print(df)

                                               post_list  num_posts
0      [Update dependency @testing-library/user-event...          1
1      [I would like to work on this issue, go for it...          5
2      [Expand support for indirections. 1) Implement...          4
3      ["Can't Rename, no element found" on anything ...         20
4      [<!-- CLA-CHECK:130 -->\n&#10060; Author of th...          1
...                                                  ...        ...
19995  [Bump mongoose from 5.9.25 to 5.13.15. Bumps [...          1
19996  [Bump puppeteer from 19.9.0 to 19.10.0. Bumps ...          1
19997  [3Dモデル用シェーダー作成. 3Dモデルで使用するシェーダーを作成する。\r\n\r\n要...          1
19998  [### <span aria-hidden="true">👷</span> Deploy ...          1
19999                               [Feature/with front]          1

[20000 rows x 2 columns]


In [4]:
# Average conversation length, measured in number of posts.
mean_num_posts = df.loc[:, 'num_posts'].mean()

# Report the result.
print(f"Mean of num_posts (Mean number of texts per conversation): {mean_num_posts}")

Mean of num_posts (Mean number of texts per conversation): 1.49645


In [5]:
# Preview the post lists of the first 100 conversations.
df["post_list"].head(100)

0     [Update dependency @testing-library/user-event...
1     [I would like to work on this issue, go for it...
2     [Expand support for indirections. 1) Implement...
3     ["Can't Rename, no element found" on anything ...
4     [<!-- CLA-CHECK:130 -->\n&#10060; Author of th...
                            ...                        
95                                 [Superseded by #20.]
96    [Bump rails-html-sanitizer from 1.2.0 to 1.4.4...
97    [Bump rails-html-sanitizer from 1.2.0 to 1.4.4...
98                                 [Superseded by #23.]
99    [Bump express from 4.17.2 to 4.17.3 in /server...
Name: post_list, Length: 100, dtype: object

In [6]:
# Reshape the data from one row per conversation to one row per post.
#
# The frame is built from `post_lists` (filled by the loading cell) instead of
# from `df` itself. The previous version read and then overwrote `df`, so
# running the cell a second time failed because the 'post_list' column was gone.
# Building from `post_lists` yields the same result and makes the cell re-runnable.
df = (
    pd.DataFrame({
        # Unique ID per conversation: its position in the input file.
        'conversation_id': range(len(post_lists)),
        'post': post_lists,
    })
    # One row per post; a conversation with an empty post list becomes a
    # single row whose 'post' is NaN.
    .explode('post', ignore_index=True)
    # Empty (NaN) placeholder columns for the language labels.
    # NOTE(review): presumably filled in by a later annotation step -- confirm.
    .assign(
        post_working_language=np.nan,
        post_embedded_language=np.nan,
        post_working_language_actual=np.nan,
        post_embedded_language_actual=np.nan,
    )
)

# Display the final DataFrame
print(df)

# Optional: persist the per-post table. A separate name is used for the output
# path so that the input `file_path` is not overwritten.
# output_path = 'conv_sample_posts.csv'
# df.to_csv(output_path, index=False)  # index=False excludes the index column
# print(f'DataFrame has been saved to {output_path}')

       conversation_id                                               post  \
0                    0  Update dependency @testing-library/user-event ...   
1                    1                 I would like to work on this issue   
2                    1                                         go for it!   
3                    1  Looks like there is a potential PoC and Docker...   
4                    1  After looking into this further I think it wil...   
...                ...                                                ...   
29924            19995  Bump mongoose from 5.9.25 to 5.13.15. Bumps [m...   
29925            19996  Bump puppeteer from 19.9.0 to 19.10.0. Bumps [...   
29926            19997  3Dモデル用シェーダー作成. 3Dモデルで使用するシェーダーを作成する。\r\n\r\n要件...   
29927            19998  ### <span aria-hidden="true">👷</span> Deploy P...   
29928            19999                                 Feature/with front   

       post_working_language  post_embedded_language  \
0                  