<a href="https://colab.research.google.com/github/chiranjibghosh551/screening/blob/master/text_classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [9]:
import pandas as pd

# Read the articles CSV, decoding it as Latin-1 ('cp1252' is the other
# encoding to try if this one fails), and show the first rows.
articles_csv = 'Articles.csv'
data = pd.read_csv(articles_csv, encoding='latin-1')
print(data.head())

                                             Article      Date  \
0  KARACHI: The Sindh government has decided to b...  1/1/2015   
1  HONG KONG: Asian markets started 2015 on an up...  1/2/2015   
2  HONG KONG:  Hong Kong shares opened 0.66 perce...  1/5/2015   
3  HONG KONG: Asian markets tumbled Tuesday follo...  1/6/2015   
4  NEW YORK: US oil prices Monday slipped below $...  1/6/2015   

                                             Heading  NewsType  
0  sindh govt decides to cut public transport far...  business  
1                    asia stocks up in new year trad  business  
2           hong kong stocks open 0.66 percent lower  business  
3             asian stocks sink euro near nine year   business  
4                 us oil prices slip below 50 a barr  business  


In [10]:
import re
from sklearn.model_selection import train_test_split

def clean_text(text):
    """Normalise raw article text into lowercase, letters-only words.

    HTML-like tags are replaced by a space, every remaining character that is
    not an ASCII letter or whitespace is removed, the text is lowercased, and
    runs of whitespace are collapsed to a single space.

    Args:
        text: The raw text. Anything that is not a ``str`` (for example
            ``None`` or the float NaN pandas uses for missing cells) is
            treated as empty.

    Returns:
        The cleaned string; ``''`` for missing or non-string input.
    """
    if not isinstance(text, str):
        # re.sub raises TypeError on None/NaN; treat such cells as empty text.
        return ''
    # Swap tags for a space (not nothing) so words separated only by markup,
    # e.g. "foo<br>bar", are not glued into one token.
    text = re.sub(r'<.*?>', ' ', text)
    text = re.sub(r'[^a-zA-Z\s]', '', text)  # Drop digits, punctuation, symbols
    # Collapse the whitespace left behind by the removals above.
    return re.sub(r'\s+', ' ', text).lower().strip()

# Store a cleaned copy of every article body next to the raw columns.
cleaned_articles = data['Article'].apply(clean_text)
data['cleaned_text'] = cleaned_articles

# Show the available columns; 'NewsType' is the one used as the target.
print(data.columns)

# Features are the cleaned texts, labels are the news types.
X, y = data['cleaned_text'], data['NewsType']

# Hold out 20% of the rows for testing, with a fixed seed for the shuffle.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

Index(['Article', 'Date', 'Heading', 'NewsType', 'cleaned_text'], dtype='object')


In [11]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Fit the TF-IDF vocabulary (capped at 5000 features) on the training texts,
# then encode the test texts with that same fitted vectorizer.
tfidf_settings = {'max_features': 5000}
vectorizer = TfidfVectorizer(**tfidf_settings)
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)


In [12]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix

# Create a logistic-regression classifier with default settings and fit it
# on the TF-IDF training matrix (fit() returns the estimator itself).
model = LogisticRegression().fit(X_train_tfidf, y_train)

# Label the held-out test matrix.
y_pred = model.predict(X_test_tfidf)


In [13]:
# Per-class precision / recall / F1 for the test predictions.
report = classification_report(y_test, y_pred)
print(report)

# Confusion matrix for the same predictions.
confusion_mat = confusion_matrix(y_test, y_pred)
print(confusion_mat)



              precision    recall  f1-score   support

    business       0.99      0.99      0.99       262
      sports       0.99      0.99      0.99       277

    accuracy                           0.99       539
   macro avg       0.99      0.99      0.99       539
weighted avg       0.99      0.99      0.99       539

[[259   3]
 [  3 274]]


In [14]:
from sklearn.model_selection import GridSearchCV

# Hyper-parameter grid: regularisation strength C crossed with two solvers.
param_grid = {
    'C': [0.1, 1, 10],
    'solver': ['liblinear', 'saga'],
}

# Both solvers in the grid shuffle the data internally, so pin random_state
# (same seed as the train/test split) to make the search repeatable.
grid_search = GridSearchCV(LogisticRegression(random_state=42), param_grid, cv=5)
grid_search.fit(X_train_tfidf, y_train)

print("Best parameters:", grid_search.best_params_)




Best parameters: {'C': 10, 'solver': 'liblinear'}




In [15]:
!pip install Flask
!pip install pyngrok



Collecting pyngrok
  Downloading pyngrok-7.2.0-py3-none-any.whl.metadata (7.4 kB)
Downloading pyngrok-7.2.0-py3-none-any.whl (22 kB)
Installing collected packages: pyngrok
Successfully installed pyngrok-7.2.0


In [16]:
from flask import Flask, request, render_template
from pyngrok import ngrok
import numpy as np
import re
import pandas as pd

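# `model`, `vectorizer` and `clean_text` are taken from the earlier cells of
# this notebook. To run the app outside the notebook, persist the model and
# vectorizer with joblib and load them here instead:
# from joblib import load
# model = load('your_model.pkl')
# vectorizer = load('your_vectorizer.pkl')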

# Landing-page markup: a single textarea form that POSTs its text to /predict.
_FORM_PAGE = '''
        <form action="/predict" method="post">
            <textarea name="text" rows="4" cols="50" placeholder="Enter your text here..."></textarea>
            <br>
            <input type="submit" value="Classify">
        </form>
    '''

app = Flask(__name__)


@app.route('/')
def home():
    """Serve the text-entry form at the site root."""
    return _FORM_PAGE

@app.route('/predict', methods=['POST'])
def predict():
    """Classify the text submitted through the form and report the label.

    Reads the ``text`` form field, normalises it with ``clean_text``, encodes
    it with the notebook-level ``vectorizer`` and asks the notebook-level
    ``model`` for a label.

    Returns:
        An HTML snippet naming the predicted news type, or a 400 response
        with a short message when the submission contains no usable text.
    """
    # .get() so a missing field goes down the same path as an empty one.
    text = request.form.get('text', '')
    cleaned_text = clean_text(text)
    if not cleaned_text:
        # Nothing survives cleaning (blank, digits or punctuation only): the
        # TF-IDF row would be all zeros and the label returned meaningless.
        return 'Please enter some text to classify.', 400
    tfidf_text = vectorizer.transform([cleaned_text])
    prediction = model.predict(tfidf_text)
    return f'The predicted news type is: <strong>{prediction[0]}</strong>'

# Start Flask's development server only when this code runs as the main module.
is_main_module = __name__ == '__main__'
if is_main_module:
    app.run()


 * Serving Flask app '__main__'
 * Debug mode: off


 * Running on http://127.0.0.1:5000
INFO:werkzeug:[33mPress CTRL+C to quit[0m


In [23]:
!ngrok config add-authtoken 2mjdQRlHSpRTSF2RQvB6BUYbJ5m_26GJYA4pKAKr4iqacQtps

Authtoken saved to configuration file: /root/.config/ngrok/ngrok.yml


In [None]:
import os
from getpass import getpass

from pyngrok import ngrok, conf

# Never store the authtoken in the notebook: read it from the
# NGROK_AUTHTOKEN environment variable, or prompt for it without echoing.
ngrok_auth_token = os.environ.get("NGROK_AUTHTOKEN") or getpass("ngrok authtoken: ")
conf.get_default().auth_token = ngrok_auth_token

# One constant so the tunnel and the Flask server always agree on the port.
FLASK_PORT = 5000

# Start the ngrok tunnel, then serve the app behind it (app.run blocks).
public_url = ngrok.connect(FLASK_PORT)
print(f" * ngrok: {public_url}")
app.run(port=FLASK_PORT)

 * ngrok: NgrokTunnel: "https://0233-35-231-230-9.ngrok-free.app" -> "http://localhost:5000"
 * Serving Flask app '__main__'
 * Debug mode: off


 * Running on http://127.0.0.1:5000
INFO:werkzeug:[33mPress CTRL+C to quit[0m
INFO:werkzeug:127.0.0.1 - - [29/Sep/2024 07:14:15] "GET / HTTP/1.1" 200 -
INFO:werkzeug:127.0.0.1 - - [29/Sep/2024 07:14:15] "[33mGET /favicon.ico HTTP/1.1[0m" 404 -
INFO:werkzeug:127.0.0.1 - - [29/Sep/2024 07:14:38] "POST /predict HTTP/1.1" 200 -
INFO:werkzeug:127.0.0.1 - - [29/Sep/2024 07:15:19] "POST /predict HTTP/1.1" 200 -
INFO:werkzeug:127.0.0.1 - - [29/Sep/2024 07:15:29] "POST /predict HTTP/1.1" 200 -
INFO:werkzeug:127.0.0.1 - - [29/Sep/2024 07:16:05] "POST /predict HTTP/1.1" 200 -
INFO:werkzeug:127.0.0.1 - - [29/Sep/2024 07:16:13] "POST /predict HTTP/1.1" 200 -
INFO:werkzeug:127.0.0.1 - - [29/Sep/2024 07:16:26] "POST /predict HTTP/1.1" 200 -
INFO:werkzeug:127.0.0.1 - - [29/Sep/2024 07:16:35] "POST /predict HTTP/1.1" 200 -


In [25]:

# Serve the Flask app again on port 5000.
flask_port = 5000
app.run(port=flask_port)


 * Serving Flask app '__main__'
 * Debug mode: off


 * Running on http://127.0.0.1:5000
INFO:werkzeug:[33mPress CTRL+C to quit[0m
