# Feature Selection using Chi-Square

In [19]:
# Options and Hyperparameters
# n_features: number of top-ranked words kept when the ranked list is sliced
# and written out at the end of the notebook.
n_features: int = 100

In [18]:
# Imports
import pandas as pd
from sklearn.feature_selection import chi2
from scipy.stats import chi2_contingency

### Data Description
The data comes from the Feature Extraction team. It consists of 20 text files, one per newsgroup, containing the features (i.e., words) extracted from the 20 newsgroups dataset. Each line of a file holds a word and its frequency, separated by a single space.

In [None]:
# Load the data
# One entry per newsgroup. Every name must appear exactly once: it is used as
# the key into `data` and later as a column label of the word-by-newsgroup
# matrix, where a repeated label would create duplicate columns.
newsgroups_list = [
    'alt.atheism.txt',
    'comp.graphics.txt',
    'comp.os.ms-windows.misc.txt',
    'comp.sys.ibm.pc.hardware.txt',
    'comp.sys.mac.hardware.txt',
    'comp.windows.x.txt',
    'misc.forsale.txt',
    'rec.autos.txt',
    'rec.motorcycles.txt',
    'rec.sport.baseball.txt',
    'rec.sport.hockey.txt',
    'sci.crypt.txt',
    'sci.electronics.txt',
    'sci.med.txt',
    'sci.space.txt',
    'soc.religion.christian.txt',
    'talk.politics.guns.txt',
    'talk.politics.mideast.txt',
    'talk.politics.misc.txt',
    'talk.religion.misc.txt',
]

# Maps each newsgroup file name to a DataFrame with columns 'word' and
# 'frequency', read from the matching "<newsgroup>_BoW.txt" file under BoW/.
data = {}
for newsgroup in newsgroups_list:
    file_path = f"BoW/{newsgroup.replace('.txt', '_BoW.txt')}"
    data[newsgroup] = pd.read_csv(
        file_path,
        sep=' ',
        names=['word', 'frequency'],
        # By default read_csv parses tokens such as "null", "nan" or "NA" as
        # missing values; here they are ordinary words and must stay strings.
        keep_default_na=False,
    )


In [None]:
# Build the word-by-newsgroup matrix
# Rows are words, columns are newsgroups, values are the word's frequency in
# that newsgroup (0 where the word does not occur).
#
# Columns are taken from the keys of `data`, which are unique by construction,
# so a repeated name in the newsgroup list cannot produce duplicate column
# labels (which would make column totals double count).
# The matrix is assembled column-wise instead of cell by cell, which avoids one
# pandas indexing call per (word, newsgroup) pair.
frequency_columns = {
    newsgroup: (
        # If a word is listed more than once in a file, keep its last entry,
        # matching what repeated per-cell assignment would leave behind.
        df.drop_duplicates(subset='word', keep='last')
          .set_index('word')['frequency']
    )
    for newsgroup, df in data.items()
}

# Aligning the per-newsgroup Series on their word index yields the union of
# all words; words absent from a newsgroup come out as NaN and are set to 0.
word_newsgroup_matrix = pd.DataFrame(frequency_columns).fillna(0)

# Kept for any later cell that uses the vocabulary as a set.
word_set = set(word_newsgroup_matrix.index)


In [None]:
# Compute Chi-Squared scores for each word
# The per-newsgroup totals and the grand total do not depend on the word, so
# they are computed once instead of on every loop iteration.
total_counts = word_newsgroup_matrix.sum(axis=0).values
grand_total = total_counts.sum()

# NOTE(review): stacking `observed` and `expected` as a 2-row table means
# chi2_contingency derives its own expected frequencies from the margins of
# that table; this is not the usual goodness-of-fit statistic. Kept as-is to
# preserve the existing scores -- confirm this is the intended test.
chi2_scores = []  # list of (word, chi-squared statistic) tuples
for word, observed in zip(word_newsgroup_matrix.index,
                          word_newsgroup_matrix.to_numpy()):
    total_word_count = observed.sum()
    # Frequency the word would have in each newsgroup if it were spread in
    # proportion to the newsgroup sizes.
    expected = total_word_count * total_counts / grand_total

    # The statistic gets its own name so that the `chi2` function imported
    # from sklearn.feature_selection is not overwritten by a float.
    chi2_stat, _, _, _ = chi2_contingency([observed, expected])
    chi2_scores.append((word, chi2_stat))

In [None]:
# Rank words by Chi-Squared score and save the top n_features
# Highest score first. sorted() is stable, so words with equal scores keep
# the relative order they have in chi2_scores.
ranked_features = sorted(chi2_scores, key=lambda x: x[1], reverse=True)

top_features = [word for word, _ in ranked_features[:n_features]]

# One word per line, no trailing newline. The encoding is pinned so the file
# is written identically on every platform and non-ASCII words cannot fail
# under a narrower locale-dependent default encoding.
with open("top_features.txt", "w", encoding="utf-8") as f:
    f.write("\n".join(top_features))