In [1]:
import snscrape.modules.twitter as sntwitter
import pandas as pd
import os

import nltk
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from nltk.stem import WordNetLemmatizer
import string

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
import joblib

In [10]:
# Set the list of directories containing the text files
dir_paths = ['business','entertainment','politics','sport','tech']

# Initialize an empty list to store the data from the text files
data = []

# Loop through each directory path in the list
for dir_path in dir_paths:
    # Loop through each directory and file in the directory path
    for subdir, dirs, files in os.walk(dir_path):
        for file in files:
            # Check if the file is a text file
            if file.endswith('.txt'):
                # Construct the full file path
                file_path = os.path.join(subdir, file)
                # Read the data from the text file into a DataFrame
                try :
                    df = pd.read_csv(file_path, sep="\t", header=None,encoding='utf-8',on_bad_lines='skip')
                # Append the DataFrame to the data list
                    df['text'] = df
                    df['type'] = dir_path
                    data.append(df)
                except:
                    df = pd.read_csv(file_path, sep="\t", header=None,encoding='latin-1',on_bad_lines='skip')
                    df['text'] = df
                    df['type'] = dir_path
                # Append the DataFrame to the data list
                    data.append(df)

# Concatenate all DataFrames in the data list into a single DataFrame
result_df = pd.concat(data, ignore_index=True)

result_df['text'] = result_df['text'].apply(preprocess_text)


# Save to final file 

df = result_df.loc[:, ['text', 'type']]

df.to_csv('BBCText.csv', index=False)

In [7]:
# Define the preprocessing functions
def preprocess_text(text):
    # Remove punctuation
    text = text.translate(str.maketrans("", "", string.punctuation))
    # Convert to lowercase
    text = text.lower()
    
    # Tokenize the text
    tokens = nltk.word_tokenize(text)
    # Remove stop words
    stop_words = set(stopwords.words('english'))
    tokens = [token for token in tokens if token not in stop_words]
    # Perform stemming
    stemmer = SnowballStemmer("english")
    tokens = [stemmer.stem(token) for token in tokens]
    # Perform lemmatization
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(token) for token in tokens]
    # Rejoin the tokens into a single string
    text = " ".join(tokens)
    return text

In [17]:
# Load the dataset
df = pd.read_csv('BBCText.csv')

# Replace NaN values with an empty string
df['text'].fillna('', inplace=True)

# Vectorize the text data
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(df['text'])

# Train the model
y = df['type']
clf = MultinomialNB()
clf.fit(X, y)

joblib.dump(clf, 'model.joblib')

# Predict the label and score for a new text sample
new_text = "Apple has announced a new product launch event next month."
X_new = vectorizer.transform([new_text])
y_new = clf.predict(X_new)
score = clf.predict_proba(X_new)

print("Predicted label:", y_new[0])
print("Score:", max(score[0]))

Predicted label: tech
Score: 0.5950122820887729


In [65]:
def predict_text_classification(data):

    # Load pre-trained model from file
    with open('model.joblib', 'rb') as file:
        model = joblib.load(file)

    # Process new text for classification

    data['text'].apply(preprocess_text)

    processed_text = vectorizer.transform(data['text']) # Replace with appropriate preprocessing method

    # Predict class for new text
    predicted_class = clf.predict(processed_text)
    score = clf.predict_proba(processed_text)

    data['Predicted'] = predicted_class
    data['Score'] = [max(x) for x in score]
    return data

In [71]:
# define the Twitter search keyword
keyword = '(from:Bibi OR from:Netanyahu)'

# define the query to be executed
query = f' #{keyword} min_faves:100 lang:en'

# define an empty list to hold the tweets
tweets = []

# set a limit for the number of tweets to retrieve
limit = 100

# loop through the search results and append each tweet to the list
for tweet in sntwitter.TwitterSearchScraper(query).get_items():

    # check if the limit has been reached
    if len(tweets) == limit:
        break
    else:
        # append the tweet data to the list
        tweets.append([tweet.date, tweet.user.username, tweet.rawContent, keyword,
                       tweet.user, tweet.replyCount, tweet.retweetCount, 
                       tweet.likeCount , tweet.quoteCount , tweet.coordinates, 
                       tweet.place, tweet.hashtags, tweet.cashtags ])

# create a pandas dataframe from the list of tweets
data = pd.DataFrame(tweets, columns=['Date', 'User', 'text','keyword',
                          'user', 'replyCount', 'retweetCount',
                          'likeCount', 'quoteCount', 'coordinates',
                          'place', 'hashtags', 'cashtags',])



In [75]:
df = predict_text_classification(data)

df.to_csv('TweetsAnalysis.csv', index=False)

df

Unnamed: 0,Date,User,text,keyword,user,replyCount,retweetCount,likeCount,quoteCount,coordinates,place,hashtags,cashtags,Predicted,Score
0,2023-03-28 21:58:56+00:00,netanyahu,Israel is a sovereign country which makes its ...,(from:Bibi OR from:Netanyahu),https://twitter.com/netanyahu,1209,1559,7598,625,,,,,business,0.969428
1,2023-03-28 21:58:56+00:00,netanyahu,My administration is committed to strengthenin...,(from:Bibi OR from:Netanyahu),https://twitter.com/netanyahu,208,336,2864,48,,,,,politics,0.508427
2,2023-03-28 21:58:55+00:00,netanyahu,I have known President Biden for over 40 years...,(from:Bibi OR from:Netanyahu),https://twitter.com/netanyahu,1299,745,5535,188,,,,,business,0.988589
3,2023-03-21 14:14:44+00:00,netanyahu,"Thank you Andrew Roberts, the premier biograph...",(from:Bibi OR from:Netanyahu),https://twitter.com/netanyahu,182,111,705,16,,,,,entertainment,0.794985
4,2023-03-16 15:28:28+00:00,netanyahu,"Now, less than eighty years later, the represe...",(from:Bibi OR from:Netanyahu),https://twitter.com/netanyahu,84,86,628,1,,,,,politics,0.963121
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,2021-05-27 17:42:30+00:00,netanyahu,"This while depicting as the ""guilty party"" a d...",(from:Bibi OR from:Netanyahu),https://twitter.com/netanyahu,352,728,4662,37,,,,,tech,0.460167
96,2021-05-27 17:42:29+00:00,netanyahu,"Once again, an immoral automatic majority at t...",(from:Bibi OR from:Netanyahu),https://twitter.com/netanyahu,302,730,4824,44,,,,,politics,0.996932
97,2021-05-27 17:42:29+00:00,netanyahu,Today's shameful decision is yet another examp...,(from:Bibi OR from:Netanyahu),https://twitter.com/netanyahu,1816,2279,11766,357,,,,,politics,0.996210
98,2021-05-19 14:29:16+00:00,netanyahu,I spoke with 70 foreign Ambassadors and diplom...,(from:Bibi OR from:Netanyahu),https://twitter.com/netanyahu,2126,1709,10068,211,,,,,sport,0.820403
