### Changing to the main directory

In [1]:
%cd ..

/home/isham/Desktop/machine-learning-projects/text-classification/few-shot-prompting-text-classification


  self.shell.db['dhist'] = compress_dhist(dhist)[-100:]


### Import Necessary Libraries

In [2]:
import pandas as pd
from utils import combine_dataframes, compute_multiclass_metrics, extract_word, get_most_common_word

In [3]:
# Read the ten numbered CSV files (test-examples_1.csv .. test-examples_10.csv)
# into a list holding one DataFrame per file.
multiple_csvs = [
    pd.read_csv(f'data/news_category_classification/test-examples_{i+1}.csv')
    for i in range(10)
]

In [4]:
# Merge the per-file frames into a single frame with the project helper
# `combine_dataframes` (imported from utils; presumably a row-wise concatenation —
# confirm there), then renumber the rows with a fresh 0..n-1 index.
# reset_index is chained rather than called with inplace=True so this cell never
# mutates the object returned by the helper and stays safe to re-run.
combined_df = combine_dataframes(multiple_csvs).reset_index(drop=True)

In [5]:
combined_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 500 entries, 0 to 499
Data columns (total 6 columns):
 #   Column                            Non-Null Count  Dtype 
---  ------                            --------------  ----- 
 0   category                          500 non-null    object
 1   text                              500 non-null    object
 2   predicted_category_llama3         500 non-null    object
 3   predicted_category_llama3:70b     500 non-null    object
 4   predicted_category_mixtral        500 non-null    object
 5   predicted_category_gpt-3.5-turbo  500 non-null    object
dtypes: object(6)
memory usage: 23.6+ KB


In [6]:
combined_df.isna().sum()

category                            0
text                                0
predicted_category_llama3           0
predicted_category_llama3:70b       0
predicted_category_mixtral          0
predicted_category_gpt-3.5-turbo    0
dtype: int64

As per the investigation, some minor cleaning is required to extract the predicted category.

### Minor Data Cleaning

As we could see from the logs, many inferences did not return a single-word answer. The model was able to classify the text into one of the listed categories, but it then went on to give an explanation, even though it had been told to answer with one word and not to explain. Therefore, for fair results, we need to do some minor cleaning to check whether each output is one of the 5 categories or not.

#### `gpt-3.5-turbo`

In [7]:
combined_df['predicted_category_gpt-3.5-turbo'].value_counts()

predicted_category_gpt-3.5-turbo
output: business         116
output: sport            111
output: politics         108
output: entertainment     93
output: tech              71
output: health             1
Name: count, dtype: int64

In [8]:
combined_df['predicted_category_gpt-3.5-turbo'].apply(lambda text: text.replace("output: ", "")).str.strip().value_counts()

predicted_category_gpt-3.5-turbo
business         116
sport            111
politics         108
entertainment     93
tech              71
health             1
Name: count, dtype: int64

In [9]:
# Remove the literal "output: " prefix seen in this column's values, then trim
# surrounding whitespace. The vectorised .str accessor replaces the per-row lambda:
# it gives the same result for strings and propagates missing values instead of
# raising on them. regex=False keeps the prefix a plain substring, exactly like
# str.replace in the original lambda.
combined_df['predicted_category_gpt-3.5-turbo'] = (
    combined_df['predicted_category_gpt-3.5-turbo']
    .str.replace("output: ", "", regex=False)
    .str.strip()
)

In [10]:
combined_df['predicted_category_gpt-3.5-turbo'].value_counts()

predicted_category_gpt-3.5-turbo
business         116
sport            111
politics         108
entertainment     93
tech              71
health             1
Name: count, dtype: int64

#### `llama3:8B`

In [11]:
combined_df['predicted_category_llama3'].value_counts()

predicted_category_llama3
politics                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                      

#### `llama3:70b`

In [12]:
combined_df['predicted_category_llama3:70b'].value_counts()

predicted_category_llama3:70b
politics                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                             113
sport                            

In [13]:
import re 
def find_first_occurrence(text):
    """Map a free-form model answer to the category keyword that appears first in it.

    The text is lower-cased and scanned for whole-word occurrences of the known
    category keywords ('technology' is treated as 'tech').

    Args:
        text (str): Raw model output.

    Returns:
        str: The category whose keyword occurs earliest in ``text``. If no keyword
        is present, the lower-cased ``text`` is returned unchanged otherwise.
    """
    keywords = {
        'tech': ['tech', 'technology'],
        'business': ['business'],
        'politics': ['politics'],
        'entertainment': ['entertainment'],
        'sport': ['sport']
    }

    text = text.lower()

    # Reverse lookup: matched word -> category label.
    category_by_synonym = {
        synonym: category
        for category, synonyms in keywords.items()
        for synonym in synonyms
    }

    # A single alternation over every synonym makes re.search return the left-most
    # hit in the text. Searching category by category in dict order (the previous
    # approach) labelled "politics ... business" as 'business' merely because
    # 'business' is listed before 'politics'.
    pattern = r'\b(?:' + '|'.join(map(re.escape, category_by_synonym)) + r')\b'
    match = re.search(pattern, text)
    if match:
        return category_by_synonym[match.group(0)]

    return text

In [14]:
combined_df['predicted_category_llama3:70b'].apply(find_first_occurrence).value_counts()

predicted_category_llama3:70b
politics                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                   113
sport                                                                                                                                                                                                                                                                                                      

In [15]:
combined_df['predicted_category_llama3:70b'] = combined_df['predicted_category_llama3:70b'].apply(find_first_occurrence)

#### `mixtral:8x7B`

In [16]:
combined_df['predicted_category_mixtral'].value_counts()

predicted_category_mixtral
sport                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                        

In [17]:
import re

def extract_category_after_colon(text):
    """Pull a category label out of answers shaped like ``<something>: <label>``.

    Double quotes and asterisks are removed first. The first colon that is followed
    (after optional whitespace) by a word is then located, and that word is compared,
    lower-cased, against the five known categories.

    Args:
        text (str): Raw model output.

    Returns:
        str: The lower-cased category when the word after the first colon is one of
        the known categories; otherwise the input with quotes and asterisks removed
        (case untouched).
    """
    valid_labels = ('tech', 'sport', 'entertainment', 'business', 'politics')
    text = text.replace("\"", "").replace("*", "")

    found = re.search(r':\s*(\w+)', text)
    if found is None:
        # No "colon + word" anywhere: hand back the de-quoted text.
        return text

    candidate = found.group(1).lower()
    return candidate if candidate in valid_labels else text

In [18]:
combined_df['predicted_category_mixtral'].apply(extract_category_after_colon).str.strip().value_counts()

predicted_category_mixtral
sport                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                        

In [19]:
combined_df['predicted_category_mixtral'].apply(extract_category_after_colon).apply(find_first_occurrence).str.strip().value_counts()

predicted_category_mixtral
sport                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                        

In [20]:
combined_df['predicted_category_mixtral'] = combined_df['predicted_category_mixtral'].apply(extract_category_after_colon).apply(find_first_occurrence).str.strip()

In [21]:
for column in combined_df.columns[2:]:
    print(combined_df[column].unique())

['politics' 'sport' 'entertainment' 'tech' 'business'
 'this text appears to be a rant or complaint about premium rate phone scams, where individuals are tricked into making expensive calls without realizing it. the author expresses frustration with the phone companies for not doing enough to prevent these scams and for charging customers for fraudulent transactions.\n\nthe main points made by the author include:\n\n* phone companies should take more responsibility for preventing these scams and refunding customers who have been victimized.\n* customers need to be more aware of how to avoid falling prey to these scams, such as being cautious when downloading software or clicking on links.\n* the phone companies are profiteering from these scams by charging customers for fraudulent transactions.\n* the government should take action to prevent these scams and reimburse victims who have been charged unfairly.\n\nthe tone of the text is critical and frustrated, with the author expressing a

In [22]:
def process_text(text):
    """Collapse a cleaned prediction to one of the five categories or ``"others"``.

    The value is stripped of surrounding whitespace and lower-cased before the
    comparison, so variants such as ``"Sport"`` or ``" tech\\n"`` are recognised.
    The original compared the raw string, which sent those variants to "others"
    even though a lower-cased copy had already been computed.

    Args:
        text (str): Prediction string.

    Returns:
        str: The normalised category if it is exactly one of 'politics', 'sport',
        'business', 'entertainment' or 'tech'; ``"others"`` for anything else,
        including multi-word answers and the empty string.
    """
    valid_categories = {'politics', 'sport', 'business', 'entertainment', 'tech'}
    normalized = text.strip().lower()
    # Any multi-word answer fails the exact-match test, so no separate word-count
    # check is needed.
    return normalized if normalized in valid_categories else "others"

In [23]:
combined_df.columns

Index(['category', 'text', 'predicted_category_llama3',
       'predicted_category_llama3:70b', 'predicted_category_mixtral',
       'predicted_category_gpt-3.5-turbo'],
      dtype='object')

In [24]:
# Run process_text over every model's prediction column, one column at a time.
for prediction_column in (
    'predicted_category_llama3',
    'predicted_category_llama3:70b',
    'predicted_category_gpt-3.5-turbo',
    'predicted_category_mixtral',
):
    combined_df[prediction_column] = combined_df[prediction_column].apply(process_text)

In [25]:
for column in combined_df.columns[2:]:
    print(combined_df[column].unique())

['politics' 'sport' 'entertainment' 'tech' 'business' 'others']
['politics' 'sport' 'entertainment' 'tech' 'business' 'others']
['politics' 'sport' 'entertainment' 'business' 'tech' 'others']
['politics' 'business' 'sport' 'entertainment' 'tech' 'others']


In [26]:
final_df = combined_df.copy()

In [27]:
# Trim leading/trailing whitespace from each model's prediction column.
for prediction_column in (
    'predicted_category_llama3',
    'predicted_category_llama3:70b',
    'predicted_category_mixtral',
    'predicted_category_gpt-3.5-turbo',
):
    final_df[prediction_column] = final_df[prediction_column].str.strip()

In [28]:
final_df.isna().sum()

category                            0
text                                0
predicted_category_llama3           0
predicted_category_llama3:70b       0
predicted_category_mixtral          0
predicted_category_gpt-3.5-turbo    0
dtype: int64

In [29]:
final_df['predicted_category_llama3'].value_counts() 

predicted_category_llama3
politics         129
sport            112
entertainment    107
business          98
tech              50
others             4
Name: count, dtype: int64

In [30]:
final_df['predicted_category_llama3:70b'].value_counts() 

predicted_category_llama3:70b
politics         113
sport            112
business         110
entertainment     95
tech              69
others             1
Name: count, dtype: int64

In [31]:
final_df['predicted_category_mixtral'].value_counts() 

predicted_category_mixtral
sport            137
politics         107
business         102
entertainment     86
tech              65
others             3
Name: count, dtype: int64

In [32]:
final_df['predicted_category_gpt-3.5-turbo'].value_counts() 

predicted_category_gpt-3.5-turbo
business         116
sport            111
politics         108
entertainment     93
tech              71
others             1
Name: count, dtype: int64

In [33]:
final_df['category'].value_counts() 

category
business         118
sport            108
politics         100
entertainment     88
tech              86
Name: count, dtype: int64

In [34]:
final_df

Unnamed: 0,category,text,predicted_category_llama3,predicted_category_llama3:70b,predicted_category_mixtral,predicted_category_gpt-3.5-turbo
0,politics,nhs waiting time target is cut hospital waitin...,politics,politics,politics,politics
1,politics,crisis ahead in social sciences a national b...,politics,politics,politics,business
2,tech,football manager scores big time for the past ...,sport,sport,sport,sport
3,politics,uk will stand firm on eu rebate britain s £3b...,politics,politics,politics,politics
4,entertainment,greer attacks bully big brother germaine gre...,entertainment,entertainment,sport,entertainment
...,...,...,...,...,...,...
495,politics,civil servants in strike ballot the uk s bigge...,politics,politics,politics,politics
496,entertainment,us tv special for tsunami relief a us televisi...,entertainment,entertainment,entertainment,entertainment
497,politics,guantanamo four free in weeks all four britons...,politics,politics,politics,politics
498,sport,chepkemei hit by big ban kenya s athletics bod...,sport,sport,sport,sport


### Final Evaluation


Multiple models, despite having different architectures, have generated identical outputs, even though the actual ground truth category differs. We will consider a category as correct if several models agree on it, minimizing any potential bias. Note that the vote in the next cell is taken over the original `category` label together with the four model predictions. This consensus category will then be assigned to the `correct_category` column.

In [35]:
# Columns that take part in the vote: the ground-truth label plus the four model
# predictions.
vote_columns = [
    'category',
    'predicted_category_llama3',
    'predicted_category_llama3:70b',
    'predicted_category_mixtral',
    'predicted_category_gpt-3.5-turbo',
]
# Row-wise (axis=1): each row's five labels are passed to get_most_common_word
# (imported from utils; presumably returns the most frequent label — confirm there).
final_df['correct_category'] = final_df[vote_columns].apply(get_most_common_word, axis=1)

In [36]:
final_df

Unnamed: 0,category,text,predicted_category_llama3,predicted_category_llama3:70b,predicted_category_mixtral,predicted_category_gpt-3.5-turbo,correct_category
0,politics,nhs waiting time target is cut hospital waitin...,politics,politics,politics,politics,politics
1,politics,crisis ahead in social sciences a national b...,politics,politics,politics,business,politics
2,tech,football manager scores big time for the past ...,sport,sport,sport,sport,sport
3,politics,uk will stand firm on eu rebate britain s £3b...,politics,politics,politics,politics,politics
4,entertainment,greer attacks bully big brother germaine gre...,entertainment,entertainment,sport,entertainment,entertainment
...,...,...,...,...,...,...,...
495,politics,civil servants in strike ballot the uk s bigge...,politics,politics,politics,politics,politics
496,entertainment,us tv special for tsunami relief a us televisi...,entertainment,entertainment,entertainment,entertainment,entertainment
497,politics,guantanamo four free in weeks all four britons...,politics,politics,politics,politics,politics
498,sport,chepkemei hit by big ban kenya s athletics bod...,sport,sport,sport,sport,sport


In [37]:
final_df['correct_category'].value_counts()

correct_category
sport            112
politics         110
business         109
entertainment     92
tech              76
others             1
Name: count, dtype: int64

In [38]:
final_df.to_csv('data/news_category_classification/final_classified_outputs.csv', index=False)