# Characteristic Analysis for LLMs

## Five Sentences

In [4]:
from Model_Evaluation.load_and_preprocess import load_and_preprocess_data, preprocess_data, create_data_loaders, TextDataset
import numpy as np
import random 
import pandas as pd

# SEED FIXED
SEED = 20241006
np.random.seed(SEED)
random.seed(SEED)

file_paths = ['DataSet/X_ij_samples_5k_Claude-3-haiku_5sentences.csv', 
              # 'DataSet/X_ij_samples_5k_Falcon-7b.csv', 
              'DataSet/X_ij_samples_5k_Qwen2.5-3B-Instruct_5sentences.csv',
              'DataSet/X_ij_samples_5k_GPT-4o-mini_5sentences.csv',
              'DataSet/X_ij_samples_5k_Llama-3.2-3B-Instruct_5sentences.csv',
              # 'DataSet/X_ij_samples_5k_Llama-2-7b-chat.csv',
              'DataSet/X_ij_samples_5k_Phi-3-Mini-4K_5sentences.csv']

X_train, X_test, X_val, y_train, y_test, y_val, label_encoder, combined_df, combined_df_temp = load_and_preprocess_data(file_paths)

Sample counts for each LLM model:
Claude-3-haiku:
  Train: 3102
  Val: 775
  Test:  969

GPT-4o-mini:
  Train: 3047
  Val: 762
  Test:  952

Llama-3.2-3B-Instruct:
  Train: 3174
  Val: 794
  Test:  993

Phi-3-Mini-4K:
  Train: 3198
  Val: 800
  Test:  1000

Qwen2.5-3B-Instruct:
  Train: 2979
  Val: 745
  Test:  931



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  combined_df['LLM_encoded'] = le.fit_transform(combined_df['LLM'])


In [5]:
def restore_test_dataframe(X_test, y_test, label_encoder):
    
    y_test_labels = label_encoder.inverse_transform(y_test)
    
    restored_df = pd.DataFrame({
        'processed_text': X_test,  
        'LLM': y_test_labels       
    })
    
    return restored_df

restored_test_df = restore_test_dataframe(X_test, y_test, label_encoder)

print(restored_test_df.head())

                                      processed_text                    LLM
0  but they all looked bland on my kitchen counte...  Llama-3.2-3B-Instruct
1  my world before you came and turned everything...  Llama-3.2-3B-Instruct
2  after two hours , she was still working on her...    Qwen2.5-3B-Instruct
3  the rest of the dress hugged my figure beautif...    Qwen2.5-3B-Instruct
4  her eyes whipped around the room, searching fo...         Claude-3-haiku


In [6]:
df = combined_df.reset_index(drop=True)
# df = restored_test_df.reset_index(drop=True)

### N-gram Frequency Analysis (Bigram)

In [7]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer(ngram_range=(2, 2), stop_words='english')  
X = vectorizer.fit_transform(df['processed_text'])

ngram_freq = pd.DataFrame(X.toarray(), columns=vectorizer.get_feature_names_out())
ngram_freq['LLM'] = df['LLM']

for llm in df['LLM'].unique():
    print(f"\nTop Bigrams for {llm}:")
    llm_ngrams = ngram_freq[ngram_freq['LLM'] == llm].drop('LLM', axis=1).sum().sort_values(ascending=False).head(5)
    print(llm_ngrams)



Top Bigrams for Claude-3-haiku:
deep breath       199
felt sense        195
moving forward    131
hard work         127
couldn help       126
dtype: int64

Top Bigrams for Qwen2.5-3B-Instruct:
felt like              288
yesterday evening      286
yesterday morning      184
yesterday afternoon    112
feels like              92
dtype: int64

Top Bigrams for GPT-4o-mini:
deep breath    292
took deep      176
felt sense     172
filled air     127
hard work      123
dtype: int64

Top Bigrams for Llama-3.2-3B-Instruct:
couldn help    225
felt like      116
living room     94
best friend     85
coffee shop     74
dtype: int64

Top Bigrams for Phi-3-Mini-4K:
felt sense       131
living room      119
couldn help      115
end day          109
floor cleaner     93
dtype: int64


### Topic Modeling

In [8]:
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer

count_vectorizer = CountVectorizer(stop_words='english', max_features=100)
count_data = count_vectorizer.fit_transform(df['processed_text'])

lda = LatentDirichletAllocation(n_components=4, random_state=20241016)
lda.fit(count_data)

topic_distribution = lda.transform(count_data)
df['Topic_1'] = topic_distribution[:, 0]
df['Topic_2'] = topic_distribution[:, 1]
df['Topic_3'] = topic_distribution[:, 2]
df['Topic_4'] = topic_distribution[:, 3]
# print(df[['LLM', 'Topic_1', 'Topic_2']])

for idx, topic in enumerate(lda.components_):
    print(f"\nTopic {idx+1}:")
    print([count_vectorizer.get_feature_names_out()[i] for i in topic.argsort()[-10:]])



Topic 1:
['home', 'yesterday', 'friend', 'today', 'soon', 'just', 'work', 'quickly', 'time', 'like']

Topic 2:
['clear', 'people', 'challenges', 'sense', 'life', 'felt', 'experience', 'mind', 'despite', 'ultimately']

Topic 3:
['help', 'make', 'home', 'family', 'work', 'friends', 'decided', 'day', 'time', 'new']

Topic 4:
['park', 'away', 'laughter', 'day', 'air', 'felt', 'eyes', 'filled', 'moment', 'room']


In [9]:
# LLM별 주제 분포 평균 계산
average_topic_distribution = df.groupby('LLM')[['Topic_1', 'Topic_2', 'Topic_3', 'Topic_4']].mean().reset_index()

print("average_topic_distribution:")
print(average_topic_distribution)


average_topic_distribution:
                     LLM   Topic_1   Topic_2   Topic_3   Topic_4
0         Claude-3-haiku  0.163721  0.326478  0.245243  0.264559
1            GPT-4o-mini  0.141053  0.291688  0.223890  0.343369
2  Llama-3.2-3B-Instruct  0.273114  0.170729  0.296453  0.259704
3          Phi-3-Mini-4K  0.185405  0.213478  0.306135  0.294982
4    Qwen2.5-3B-Instruct  0.393773  0.132102  0.275452  0.198673


## Single Sentence

In [10]:
from Model_Evaluation.load_and_preprocess import load_and_preprocess_data, preprocess_data, create_data_loaders, TextDataset
import numpy as np
import random 

# SEED FIXED
SEED = 20241006
np.random.seed(SEED)
random.seed(SEED)

file_paths = ['DataSet/X_ij_samples_5k_Claude-3-haiku.csv', 
              'DataSet/X_ij_samples_5k_Falcon-7b.csv', 
              'DataSet/X_ij_samples_5k_Qwen2.5-3B-Instruct.csv',
              'DataSet/X_ij_samples_5k_GPT-4o-mini.csv',
              'DataSet/X_ij_samples_5k_Llama-3.2-3B-Instruct.csv',
              'DataSet/X_ij_samples_5k_Llama-2-7b-chat.csv',
              'DataSet/X_ij_samples_5k_Phi-3-Mini-4K.csv']

X_train, X_test, X_val, y_train, y_test, y_val, label_encoder, combined_df_1sen, combined_df_temp = load_and_preprocess_data(file_paths)

Sample counts for each LLM model:
Claude-3-haiku:
  Train: 3179
  Val: 795
  Test:  993

Falcon-7b:
  Train: 3168
  Val: 792
  Test:  990

GPT-4o-mini:
  Train: 3198
  Val: 800
  Test:  1000

Llama-2-7b-chat:
  Train: 3182
  Val: 796
  Test:  994

Llama-3.2-3B-Instruct:
  Train: 3198
  Val: 799
  Test:  1000

Phi-3-Mini-4K:
  Train: 3197
  Val: 800
  Test:  1000

Qwen2.5-3B-Instruct:
  Train: 3178
  Val: 794
  Test:  993



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  combined_df['LLM_encoded'] = le.fit_transform(combined_df['LLM'])


### N-gram Frequency Analysis (Bigram)

In [11]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer

df2 = combined_df_1sen.reset_index(drop=True)

vectorizer = CountVectorizer(ngram_range=(2, 2), stop_words='english')  
X = vectorizer.fit_transform(df2['processed_text'])
ngram_freq = pd.DataFrame(X.toarray(), columns=vectorizer.get_feature_names_out())
ngram_freq['LLM'] = df2['LLM']

for llm in df2['LLM'].unique():
    print(f"\nTop Bigrams for {llm}:")
    llm_ngrams = ngram_freq[ngram_freq['LLM'] == llm].drop('LLM', axis=1).sum().sort_values(ascending=False).head(5)
    print(llm_ngrams)



Top Bigrams for Claude-3-haiku:
late meeting        33
surprise party      25
deep breath         22
beautiful sunset    21
felt sense          18
dtype: int64

Top Bigrams for Falcon-7b:
walked away          30
deep breath          27
gravity situation    25
sight behold         23
realized gravity     18
dtype: int64

Top Bigrams for Qwen2.5-3B-Instruct:
yesterday evening    84
yesterday morning    28
dinner tonight       19
work today           18
late appointment     15
dtype: int64

Top Bigrams for GPT-4o-mini:
felt sense          30
beautiful sunset    29
finish homework     22
deep breath         22
finish project      20
dtype: int64

Top Bigrams for Llama-3.2-3B-Instruct:
high school       35
best friend       35
living room       34
birthday party    32
felt like         31
dtype: int64

Top Bigrams for Llama-2-7b-chat:
home work            23
late work            21
finished homework    19
video games          19
walked away          16
dtype: int64

Top Bigrams for Phi-3-M

In [9]:
def calculate_ttr(text):
    tokens = text.split()
    return len(set(tokens)) / len(tokens)

combined_df_1sen['TTR'] = combined_df_1sen['processed_text'].apply(calculate_ttr)
print(df[['LLM', 'TTR']])


                  LLM       TTR
0      Claude-3-haiku  0.769231
1      Claude-3-haiku  0.833333
2      Claude-3-haiku  0.813559
3      Claude-3-haiku  0.803030
4      Claude-3-haiku  0.788732
...               ...       ...
24995   Phi-3-Mini-4K  0.800000
24996   Phi-3-Mini-4K  0.859649
24997   Phi-3-Mini-4K  0.836066
24998   Phi-3-Mini-4K  0.847826
24999   Phi-3-Mini-4K  0.900000

[24221 rows x 2 columns]


In [11]:
average_metrics = combined_df_1sen.groupby('LLM').mean('TTR').reset_index()
average_metrics[['LLM', 'TTR']]

Unnamed: 0,LLM,TTR
0,Claude-3-haiku,0.961883
1,GPT-4o-mini,0.958474
2,Llama-3.2-3B-Instruct,0.972893
3,Phi-3-Mini-4K,0.946084
4,Qwen2.5-3B-Instruct,0.977158


Vocabulary Richness

In [None]:
def calculate_ttr(text):
    tokens = text.split()
    return len(set(tokens)) / len(tokens)

df['TTR'] = df['processed_text'].apply(calculate_ttr)
print(df[['LLM', 'TTR']])



In [None]:
average_metrics = df.groupby('LLM').mean('TTR').reset_index()
average_metrics[['LLM', 'TTR', 'NOUN', 'VERB', 'ADJ', 'ADV']]

In [None]:
import spacy
from collections import Counter

nlp = spacy.load("en_core_web_sm")

def pos_tag_distribution(text):
    doc = nlp(text)
    pos_counts = Counter([token.pos_ for token in doc])
    total = sum(pos_counts.values())
    return {pos: count / total for pos, count in pos_counts.items()}

pos_distribution = df['processed_text'].apply(pos_tag_distribution).apply(pd.Series).fillna(0)
df = pd.concat([df, pos_distribution], axis=1)
print(df[['LLM', 'NOUN', 'VERB', 'ADJ', 'ADV']])

In [None]:
!pip install textblob

In [None]:
from textblob import TextBlob

def get_sentiment(text):
    blob = TextBlob(text)
    return blob.sentiment.polarity, blob.sentiment.subjectivity

df[['Sentiment_Polarity', 'Sentiment_Subjectivity']] = df['processed_text'].apply(lambda x: pd.Series(get_sentiment(x)))
print(df[['LLM', 'Sentiment_Polarity', 'Sentiment_Subjectivity']])

In [None]:

grouped_stats = df.groupby('LLM').agg({
    'Sentiment_Polarity': ['mean', 'std', 'min', 'max', 'median', lambda x: x.quantile(0.25), lambda x: x.quantile(0.75)],
    'Sentiment_Subjectivity': ['mean', 'std', 'min', 'max', 'median', lambda x: x.quantile(0.25), lambda x: x.quantile(0.75)]
})


grouped_stats.columns = ['_'.join(col).strip() for col in grouped_stats.columns.values]

print(grouped_stats)

In [None]:
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer


model = SentenceTransformer('all-MiniLM-L6-v2')

embeddings = model.encode(df['processed_text'].tolist())

similarity_matrix = cosine_similarity(embeddings)

similarity_df = pd.DataFrame(similarity_matrix, index=df['LLM'], columns=df['LLM'])
# print(similarity_df)


In [None]:

similarity_df = pd.DataFrame(similarity_matrix, index=df.index, columns=df.index)

for llm in df['LLM'].unique():

    indices = df[df['LLM'] == llm].index
    

    llm_similarity_matrix = similarity_df.loc[indices, indices]
    

    print(f"\nSimilarity Matrix for {llm}:")
    print(llm_similarity_matrix)

In [None]:
embeddings = np.array(embeddings)  

grouped_embeddings = df.groupby('LLM').apply(lambda x: embeddings[x.index - df.index[0]].mean(axis=0))

group_similarity_matrix = cosine_similarity(grouped_embeddings.tolist())

group_similarity_df = pd.DataFrame(group_similarity_matrix, index=df['LLM'].unique(), columns=df['LLM'].unique())

print("\nLLM Group Similarity Matrix:")
print(group_similarity_df)