From 4f81dbab066b75805d55a1538ca6c629a2fc9637 Mon Sep 17 00:00:00 2001 From: devidnyk Date: Sat, 11 Oct 2025 23:59:20 +0530 Subject: [PATCH 1/7] Added search engine and removed keys --- src/Backend/AppContext.cs | 10 +- src/Backend/Backend.csproj | 3 + .../Controllers/JobSearchController.cs | 44 ++++++ src/Backend/Models/Internal/ScrappedJob.cs | 37 +++++ src/Backend/Models/Public/GSResult.cs | 132 ++++++++++++++++++ src/Backend/Operations/AIEngine.cs | 16 +++ src/Backend/Operations/GSEngine.cs | 126 +++++++++++++++++ src/Backend/Program.cs | 13 +- src/Backend/Properties/launchSettings.json | 10 ++ src/Backend/appsettings.json | 4 + 10 files changed, 383 insertions(+), 12 deletions(-) create mode 100644 src/Backend/Controllers/JobSearchController.cs create mode 100644 src/Backend/Models/Internal/ScrappedJob.cs create mode 100644 src/Backend/Models/Public/GSResult.cs create mode 100644 src/Backend/Operations/AIEngine.cs create mode 100644 src/Backend/Operations/GSEngine.cs diff --git a/src/Backend/AppContext.cs b/src/Backend/AppContext.cs index 8d97ddf..5950797 100644 --- a/src/Backend/AppContext.cs +++ b/src/Backend/AppContext.cs @@ -5,13 +5,15 @@ namespace Backend { public class AppContext { - public readonly Operations.DataProvider dataProvider; + public readonly DataProvider dataProvider; public readonly IConfiguration configuration; - public readonly ILogger logger; + public readonly ILogger logger; + public readonly GSEngine gsEngine; - public AppContext(CosmosClient cosmosClient, IConfiguration configuration, ILogger logger) + public AppContext(DataProvider _dataProvider, GSEngine _gsEngine, IConfiguration configuration, ILogger logger) { - this.dataProvider = new Operations.DataProvider(cosmosClient, configuration, logger); + this.dataProvider = _dataProvider; + this.gsEngine = _gsEngine; this.configuration = configuration; this.logger = logger; } diff --git a/src/Backend/Backend.csproj b/src/Backend/Backend.csproj index 15e787a..768ca68 100644 --- a/src/Backend/Backend.csproj +++ b/src/Backend/Backend.csproj @@ -7,6 +7,9 @@ + + + diff --git a/src/Backend/Controllers/JobSearchController.cs b/src/Backend/Controllers/JobSearchController.cs new file mode 100644 index 0000000..8d1a32e --- /dev/null +++ b/src/Backend/Controllers/JobSearchController.cs @@ -0,0 +1,44 @@ +namespace Backend.Controllers +{ + using Microsoft.AspNetCore.Mvc; + + [ApiController] + public class JobSearchController : ControllerBase + { + private readonly AppContext appContext; + public JobSearchController(AppContext appContext) + { + this.appContext = appContext; + } + + [HttpGet] + [Route("/jobs/search")] + public async Task>> SearchJobs([FromQuery(Name = "q")] string query) + { + var gsEngine = this.appContext.gsEngine; + var result = await gsEngine.SearchAndScrapeJobsAsync(query); + if (result != null) + { + return Ok(result); + } + return StatusCode(500, "Error occurred while searching for jobs."); + } + + [HttpGet] + [Route("/jobs/latest")] + public ActionResult GetLatestJobs() + { + // Placeholder implementation for latest jobs + return Ok("Latest job postings"); + } + + [HttpGet] + [Route("/jobs/{id}")] + public ActionResult GetJobById(string id) + { + // Placeholder implementation for getting job by ID + return Ok($"Job details for ID: {id}"); + } + + } +} \ No newline at end of file diff --git a/src/Backend/Models/Internal/ScrappedJob.cs b/src/Backend/Models/Internal/ScrappedJob.cs new file mode 100644 index 0000000..9759d83 --- /dev/null +++ b/src/Backend/Models/Internal/ScrappedJob.cs @@ -0,0 +1,37 @@ 
+namespace Backend.Models.Internal
+{
+    public class ScrappedJob
+    {
+        public string jobId { get; set; }
+        public string title { get; set; }
+        public string displayLink { get; set; }
+        public string snippet { get; set; }
+        public string description { get; set; }
+        public string link { get; set; }
+        public DateTime scrappedTime { get; set; }
+        public List<string> tags { get; set; } = new List<string>();
+
+        public ScrappedJob() { }
+        public ScrappedJob(Models.Public.Item item, DateTime scrappedTime)
+        {
+            this.title = item.title;
+            this.displayLink = item.displayLink;
+            this.snippet = item.snippet;
+            this.link = item.link;
+            this.jobId = GenerateHashId(item.link, item.displayLink);
+            this.scrappedTime = scrappedTime;
+            this.description = "NA";
+        }
+
+        private string GenerateHashId(string url, string displayLink)
+        {
+            // Use a simple hash code and hex encoding for a lightweight hash
+            int hash = url.GetHashCode();
+            string base36 = Math.Abs(hash).ToString("x"); // Hexadecimal representation
+            string dtime = DateTime.UtcNow.ToString("yyyyMMdd");
+            // Pad or trim to 10 characters
+            var hashvalue = base36.Length > 10 ? base36.Substring(0, 10) : base36.PadLeft(10, '0');
+            return $"{displayLink}-{dtime}-{hashvalue}";
+        }
+    }
+}
\ No newline at end of file
diff --git a/src/Backend/Models/Public/GSResult.cs b/src/Backend/Models/Public/GSResult.cs
new file mode 100644
index 0000000..e89800e
--- /dev/null
+++ b/src/Backend/Models/Public/GSResult.cs
@@ -0,0 +1,132 @@
+namespace Backend.Models.Public
+{
+    public class GSResult
+    {
+        public string kind { get; set; }
+        public UrlInfo url { get; set; }
+        public Queries queries { get; set; }
+        public Context context { get; set; }
+        public SearchInformation searchInformation { get; set; }
+        public List<Item> items { get; set; }
+    }
+
+    public class UrlInfo
+    {
+        public string type { get; set; }
+        public string template { get; set; }
+    }
+
+    public class Queries
+    {
+        public List<QueryRequest> request { get; set; }
+        public List<QueryRequest> nextPage { get; set; }
+    }
+
+    public class QueryRequest
+    {
+        public string totalResults { get; set; }
+        public int count { get; set; }
+        public int startIndex { get; set; }
+        public string inputEncoding { get; set; }
+        public string outputEncoding { get; set; }
+        public string safe { get; set; }
+        public string cx { get; set; }
+        public string sort { get; set; }
+        public string gl { get; set; }
+        public string siteSearch { get; set; }
+        public string siteSearchFilter { get; set; }
+        public string exactTerms { get; set; }
+        public string excludeTerms { get; set; }
+        public string dateRestrict { get; set; }
+    }
+
+    public class Context
+    {
+        public string title { get; set; }
+    }
+
+    public class SearchInformation
+    {
+        public double searchTime { get; set; }
+        public string formattedSearchTime { get; set; }
+        public string totalResults { get; set; }
+        public string formattedTotalResults { get; set; }
+    }
+
+    public class Item
+    {
+        public string kind { get; set; }
+        public string title { get; set; }
+        public string htmlTitle { get; set; }
+        public string link { get; set; }
+        public string displayLink { get; set; }
+        public string snippet { get; set; }
+        public string htmlSnippet { get; set; }
+        public string formattedUrl { get; set; }
+        public string htmlFormattedUrl { get; set; }
+        // public PageMap pagemap { get; set; } // Not in use currently
+    }
+
+    /*
+    #region PageMapClasses
+    public class PageMap
+    {
+        public List<MetaTag> metatags { get; set; }
+        public List<CseThumbnail> cse_thumbnail { get; set; }
+        public List<CseImage> cse_image { get; set; }
+        public List<BreadcrumbList> BreadcrumbList { get; set; }
+
public List organization { get; set; } + } + + public class MetaTag + { + public string image { get; set; } + public string og_type { get; set; } + public string viewport { get; set; } + public string title { get; set; } + public string og_url { get; set; } + public string og_image { get; set; } + public string og_site_name { get; set; } + public string og_locale { get; set; } + public string og_description { get; set; } + public string twitter_card { get; set; } + public string twitter_image { get; set; } + public string author { get; set; } + public string url { get; set; } + public string position { get; set; } + public string referrer { get; set; } + public string csrf_token { get; set; } + public string csrf_param { get; set; } + public string jobidentifier { get; set; } + public string og_image_width { get; set; } + public string og_image_height { get; set; } + public string http_ogp_me_ns_article_published_time { get; set; } + public string http_ogp_me_ns_article_modified_time { get; set; } + public string http_ogp_me_ns_article_section { get; set; } + public string twitter_site { get; set; } + } + + public class CseThumbnail + { + public string src { get; set; } + public string width { get; set; } + public string height { get; set; } + } + + public class CseImage + { + public string src { get; set; } + } + + public class BreadcrumbList + { + // Add properties if needed + } + + public class Organization + { + public string sameas { get; set; } + } + #endregion PageMapClasses + */ +} \ No newline at end of file diff --git a/src/Backend/Operations/AIEngine.cs b/src/Backend/Operations/AIEngine.cs new file mode 100644 index 0000000..526b377 --- /dev/null +++ b/src/Backend/Operations/AIEngine.cs @@ -0,0 +1,16 @@ +namespace Backend.Operations +{ + using Azure.Identity; + using Azure.AI.Inference; + using Azure.Core; + using Azure.Core.Pipeline; + class AIEngine + { + private const string OPENAI_API_URL = "https://job-analyzer.services.ai.azure.com/api/projects/firstProject"; + private readonly ILogger logger; + private readonly IConfiguration configuration; + public AIEngine(IConfiguration configuration, ILogger logger) + { + } + } +} \ No newline at end of file diff --git a/src/Backend/Operations/GSEngine.cs b/src/Backend/Operations/GSEngine.cs new file mode 100644 index 0000000..4a58b4f --- /dev/null +++ b/src/Backend/Operations/GSEngine.cs @@ -0,0 +1,126 @@ +namespace Backend.Operations +{ + using System.Data.Common; + using Backend.Models.Internal; + using Microsoft.Azure.Cosmos.Linq; + using Newtonsoft.Json; + public class GSEngine + { + private readonly string apiKey; + private readonly string searchEngineId; + private readonly HttpClient httpClient; + private string baseUrl = "https://customsearch.googleapis.com/customsearch/v1"; + private int maxResultsPerSearch = 150; + ILogger logger; + + public GSEngine(IConfiguration configuration, ILogger _logger) + { + this.apiKey = configuration["GoogleSearch:ApiKey"] ?? throw new ArgumentNullException("Google Search API Key is not configured."); + this.searchEngineId = configuration["GoogleSearch:SearchEngineId"] ?? 
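
Since this patch strips the API key out of appsettings.json, the value has to reach configuration["GoogleSearch:ApiKey"] from another source at runtime. A minimal sketch, assuming the default ASP.NET Core configuration stack (environment variables map "__" onto ":"):

    // Set GoogleSearch__ApiKey in the environment before launch; the default
    // host maps it onto configuration["GoogleSearch:ApiKey"] automatically.
    var config = new ConfigurationBuilder()
        .AddJsonFile("appsettings.json", optional: true)
        .AddEnvironmentVariables()
        .Build();
    string? apiKey = config["GoogleSearch:ApiKey"]; // null if neither source supplies it
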
throw new ArgumentNullException("Google Search Engine ID is not configured."); + this.logger = _logger; + this.httpClient = new HttpClient(); + } + + public async Task> SearchAndScrapeJobsAsync(string query) + { + var allJobs = new List(); + int startIndex = 1, totalResults = 0; + + var template = $"{this.baseUrl}?key={apiKey}&cx={searchEngineId}&q={Uri.EscapeDataString(query)}"; + template += AddLocationToQuery() + AddDateRestrictionToQuery() + AddNegativeTermToQuery() + + AddPositiveTermToQuery() + RemoveSiteSearchFromQuery() + AddAdditionalSearchTerms(); + + do + { + var url = template + AddStartIndexToQuery(startIndex); + var res = await SearchRawUrlAsync(url); + if (res == null) + { + logger.LogError("SearchAsync returned null result."); + break; + } + else if (res.queries.request[0].count == 0) + { + logger.LogInformation($"No results found for query: {url}"); + break; + } + + foreach (var item in res.items) + { + var job = new ScrappedJob(item, DateTime.UtcNow); + allJobs.Add(job); + } + + totalResults = int.Parse(res.queries.request[0].totalResults); + startIndex += res.queries.request[0].count; + } + while (startIndex < maxResultsPerSearch && startIndex < totalResults); + + this.logger.LogInformation($"Fetched {allJobs.Count} jobs. Total available: {totalResults}. Using url template: {template}"); + + return allJobs; + } + + public async Task SearchRawUrlAsync(string url) + { + try + { + var response = await httpClient.GetAsync(url); + response.EnsureSuccessStatusCode(); + var content = await response.Content.ReadAsStringAsync(); + return JsonConvert.DeserializeObject(content); + } + catch (Exception ex) + { + logger.LogError(ex, "Error occurred during Google Search API call."); + } + + return null; + } + + private string AddLocationToQuery(string location = "in") + { + return $"&gl={location}"; + } + + private string AddDateRestrictionToQuery(string dateRestrict = "d1") + { + return $"&dateRestrict={dateRestrict}"; + } + + private string AddNegativeTermToQuery(string phrase = "manager") + { + return $"&excludeTerms={Uri.EscapeDataString(phrase)}"; + } + + private string AddPositiveTermToQuery(string phrase = "Software Engineer") + { + return $"&exactTerms={Uri.EscapeDataString(phrase)}"; + } + + private string AddSiteSearchToQuery(string site = "linkedin.com") + { + return $"&siteSearch={site}&siteSearchFilter=i"; + } + + private string RemoveSiteSearchFromQuery(string site = "linkedin.com") + { + return $"&siteSearch={site}&siteSearchFilter=e"; + } + + private string AddSortingToQuery(string sort = "date") + { + return $"&sort={sort}"; + } + + private string AddAdditionalSearchTerms(string terms = "India") + { + return $"&hq={Uri.EscapeDataString(terms)}"; + } + + private string AddStartIndexToQuery(int startIndex = 1) + { + return $"&start={startIndex}"; + } + } +} \ No newline at end of file diff --git a/src/Backend/Program.cs b/src/Backend/Program.cs index 7583545..17b9aff 100644 --- a/src/Backend/Program.cs +++ b/src/Backend/Program.cs @@ -43,7 +43,7 @@ public static void Main(string[] args) { options.AddPolicy("AllowReactApp", builder => builder - .WithOrigins("http://localhost:3000") + .AllowAnyOrigin() .AllowAnyMethod() .AllowAnyHeader()); }); @@ -62,13 +62,10 @@ public static void Main(string[] args) // Register AppContext as singleton var config = builder.Configuration; var cosmosClient = new CosmosClient(config["ApplicationSettings:CosmosDbUri"], config["ApplicationSettings:CosmosDbPrimaryKey"]); - builder.Services.AddSingleton(s => - new AppContext( - cosmosClient, - 
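
One caveat when reading the paging loop above: the Custom Search JSON API serves at most 100 results per query, so maxResultsPerSearch = 150 can never be reached and requests with a start index past the ceiling fail. A defensive sketch (the ceiling is Google's documented limit, not a value from this patch):

    // Clamp paging to the API's 100-result ceiling before looping.
    static int EffectiveMax(int maxResultsPerSearch) =>
        Math.Min(maxResultsPerSearch, 100); // 100 = Custom Search JSON API hard cap
    // the loop condition then becomes:
    //   startIndex < EffectiveMax(150) && startIndex < totalResults
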
builder.Configuration, - s.GetRequiredService>() - ) - ); + builder.Services.AddSingleton(cosmosClient); + builder.Services.AddSingleton(); + builder.Services.AddSingleton(); + builder.Services.AddSingleton(); var app = builder.Build(); ILogger logger = app.Logger; diff --git a/src/Backend/Properties/launchSettings.json b/src/Backend/Properties/launchSettings.json index fe3de06..8b6a653 100644 --- a/src/Backend/Properties/launchSettings.json +++ b/src/Backend/Properties/launchSettings.json @@ -29,6 +29,16 @@ "ASPNETCORE_ENVIRONMENT": "Development" } }, + "container": { + "commandName": "Project", + "dotnetRunMessages": true, + "launchBrowser": true, + "launchUrl": "swagger", + "applicationUrl": "http://0.0.0.0:5164", + "environmentVariables": { + "ASPNETCORE_ENVIRONMENT": "Development" + } + }, "IIS Express": { "commandName": "IISExpress", "launchBrowser": true, diff --git a/src/Backend/appsettings.json b/src/Backend/appsettings.json index 9ee91c9..627b327 100644 --- a/src/Backend/appsettings.json +++ b/src/Backend/appsettings.json @@ -20,5 +20,9 @@ }, "ConnectionString": "" }, + "GoogleSearch": { + "ApiKey": "", + "SearchEngineId": "e509d21f7c4af4d2c" + }, "AllowedHosts": "*" } From 07c38bb64c5bc621832616fa56542dd5d09a9baf Mon Sep 17 00:00:00 2001 From: devidnyk Date: Sun, 12 Oct 2025 02:55:09 +0530 Subject: [PATCH 2/7] Added AI Engine to label job category --- src/Backend/AppContext.cs | 4 +- src/Backend/Backend.csproj | 2 + .../Controllers/JobSearchController.cs | 12 +- src/Backend/Operations/AIEngine.cs | 123 +++++++++++++++++- src/Backend/Operations/GSEngine.cs | 2 +- src/Backend/Program.cs | 1 + 6 files changed, 136 insertions(+), 8 deletions(-) diff --git a/src/Backend/AppContext.cs b/src/Backend/AppContext.cs index 5950797..6f3ddfa 100644 --- a/src/Backend/AppContext.cs +++ b/src/Backend/AppContext.cs @@ -9,13 +9,15 @@ public class AppContext public readonly IConfiguration configuration; public readonly ILogger logger; public readonly GSEngine gsEngine; + public readonly AIEngine aiEngine; - public AppContext(DataProvider _dataProvider, GSEngine _gsEngine, IConfiguration configuration, ILogger logger) + public AppContext(DataProvider _dataProvider, GSEngine _gsEngine, AIEngine _aiEngine, IConfiguration configuration, ILogger logger) { this.dataProvider = _dataProvider; this.gsEngine = _gsEngine; this.configuration = configuration; this.logger = logger; + this.aiEngine = _aiEngine; } } } \ No newline at end of file diff --git a/src/Backend/Backend.csproj b/src/Backend/Backend.csproj index 768ca68..ad94ee8 100644 --- a/src/Backend/Backend.csproj +++ b/src/Backend/Backend.csproj @@ -7,7 +7,9 @@ + + diff --git a/src/Backend/Controllers/JobSearchController.cs b/src/Backend/Controllers/JobSearchController.cs index 8d1a32e..c38a0de 100644 --- a/src/Backend/Controllers/JobSearchController.cs +++ b/src/Backend/Controllers/JobSearchController.cs @@ -3,6 +3,7 @@ namespace Backend.Controllers using Microsoft.AspNetCore.Mvc; [ApiController] + [Route("api")] public class JobSearchController : ControllerBase { private readonly AppContext appContext; @@ -19,13 +20,22 @@ public JobSearchController(AppContext appContext) var result = await gsEngine.SearchAndScrapeJobsAsync(query); if (result != null) { + var levels = await this.appContext.aiEngine.GetJobLevelAsync(result); + foreach (var level in levels) + { + var job = result.FirstOrDefault(j => j.jobId == level.Key); + if (job != null) + { + job.tags.Add(level.Value); + } + } return Ok(result); } return StatusCode(500, "Error occurred while 
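
A quick way to exercise the new endpoint once the service is up; the port comes from the "container" launch profile above, and the query string is a placeholder:

    // Smoke test for GET /jobs/search (the method route starts with "/",
    // so it is absolute and ignores the controller's "api" prefix).
    using var client = new HttpClient { BaseAddress = new Uri("http://localhost:5164") };
    var json = await client.GetStringAsync("/jobs/search?q=" + Uri.EscapeDataString("software engineer jobs"));
    Console.WriteLine(json); // a serialized List<ScrappedJob>, tags filled by the AI labeling pass
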
searching for jobs."); } [HttpGet] - [Route("/jobs/latest")] + [Route("/jobs")] public ActionResult GetLatestJobs() { // Placeholder implementation for latest jobs diff --git a/src/Backend/Operations/AIEngine.cs b/src/Backend/Operations/AIEngine.cs index 526b377..538c783 100644 --- a/src/Backend/Operations/AIEngine.cs +++ b/src/Backend/Operations/AIEngine.cs @@ -1,16 +1,129 @@ namespace Backend.Operations { + using Azure; + using Azure.AI; using Azure.Identity; using Azure.AI.Inference; - using Azure.Core; - using Azure.Core.Pipeline; - class AIEngine + using Azure.AI.Projects; + using Azure.AI.Agents.Persistent; + using System.Diagnostics; + using Backend.Models.Internal; + using Newtonsoft.Json; + + public class AIEngine { - private const string OPENAI_API_URL = "https://job-analyzer.services.ai.azure.com/api/projects/firstProject"; + private const string AI_SERVICE_ENDPOINT = "https://job-analyzer.services.ai.azure.com/api/projects/firstProject"; + private const string AGENT_ID = "asst_gWZPhAs5gg4jVvmuto9sop5h"; private readonly ILogger logger; private readonly IConfiguration configuration; + private PersistentAgent agent; + private PersistentAgentsClient agentsClient; public AIEngine(IConfiguration configuration, ILogger logger) - { + { + this.logger = logger; + this.configuration = configuration; + for (int i = 0; i < 3; i++) + { + try + { + this.agentsClient = new(AI_SERVICE_ENDPOINT, new DefaultAzureCredential()); + this.agent = this.agentsClient.Administration.GetAgent(AGENT_ID); + this.logger.LogInformation($"AIEngine initialized successfully. Endpoint: {AI_SERVICE_ENDPOINT}, AgentId: {AGENT_ID}"); + break; + } + catch (Exception ex) + { + logger.LogError($"Error initializing AIEngine: {ex.Message}"); + Task.Delay((i + 1) * 2000).ConfigureAwait(false).GetAwaiter().GetResult(); + } + } + + if (!IsReady()) + { + this.logger.LogError("AIEngine failed to initialize properly."); + throw new InvalidOperationException("AIEngine failed to initialize properly."); + } + } + + public bool IsReady() + { + return this.agent != null && this.agentsClient != null; + } + + public async Task>> GetJobLevelAsync(List scrappedJobs) + { + var results = new List>(); + this.logger.LogInformation($"Processing {scrappedJobs.Count} scrapped jobs. Ready: {IsReady()}"); + for (int i=0; i < scrappedJobs.Count; i += 20) + { + var batch = scrappedJobs.Skip(i).Take(20).ToList(); + try + { + var sw = Stopwatch.StartNew(); + var prompt = JsonConvert.SerializeObject(batch); + var response = await GetResponseInternalAsync(prompt); + sw.Stop(); + this.logger.LogInformation($"Processed jobs: {string.Join(",", batch.Select(j => j.jobId))} | response: {response}"); + var kvList = response.Split(",").Select(kvs => kvs.Split(":")).Where(kv => kv.Length == 2).Select(kv => new KeyValuePair(kv[0].Trim(), kv[1].Trim())).ToList(); + results.AddRange(kvList); + } + catch (Exception ex) + { + this.logger.LogError($"Error processing batch: {string.Join(",", batch.Select(j => j.jobId))} | {ex.Message}"); + } + } + return results; + } + + private async Task GetResponseInternalAsync(string input) + { + if (!IsReady()) + { + this.logger.LogError($"AIEngine is not properly initialized. 
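
GetJobLevelAsync assumes the agent replies with a flat, comma-separated list of id:level pairs; anything else is silently dropped by the Length == 2 filter. A sketch of the expected round-trip (the reply text is illustrative, not a captured response):

    string response = "JOBID1:Senior, JOBID2:Entry, JOBID3:Mid"; // assumed agent output format
    var kvList = response.Split(",")
        .Select(kvs => kvs.Split(":"))
        .Where(kv => kv.Length == 2)
        .Select(kv => new KeyValuePair<string, string>(kv[0].Trim(), kv[1].Trim()))
        .ToList(); // [(JOBID1, Senior), (JOBID2, Entry), (JOBID3, Mid)]

Note the job ids generated by ScrappedJob contain no ':' characters, which is what keeps this split safe.
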
Given input: {input}"); + throw new InvalidOperationException("AIEngine is not properly initialized."); + } + + PersistentAgentThread thread = agentsClient.Threads.CreateThread(); + + PersistentThreadMessage messageResponse = agentsClient.Messages.CreateMessage( + thread.Id, + MessageRole.User, + input); + + ThreadRun run = agentsClient.Runs.CreateRun( + thread.Id, + agent.Id); + + // Poll until the run reaches a terminal status + do + { + await Task.Delay(TimeSpan.FromMilliseconds(500)); + run = agentsClient.Runs.GetRun(thread.Id, run.Id); + } + while (run.Status == RunStatus.Queued + || run.Status == RunStatus.InProgress); + if (run.Status != RunStatus.Completed) + { + this.logger.LogError($"Run failed or was canceled. ThreadId: {thread.Id} Last error: {run.LastError?.Message}"); + throw new InvalidOperationException($"Run failed or was canceled: {run.LastError?.Message}"); + } + + Pageable messages = agentsClient.Messages.GetMessages( + thread.Id, order: ListSortOrder.Ascending); + + string response = string.Empty; + PersistentThreadMessage lastThreadMessage = messages.Last(); + + foreach (MessageContent contentItem in lastThreadMessage.ContentItems) + { + if (contentItem is MessageTextContent textItem) + { + response += textItem.Text; + } + } + + agentsClient.Threads.DeleteThread(thread.Id); + return response; } } } \ No newline at end of file diff --git a/src/Backend/Operations/GSEngine.cs b/src/Backend/Operations/GSEngine.cs index 4a58b4f..68c4d4b 100644 --- a/src/Backend/Operations/GSEngine.cs +++ b/src/Backend/Operations/GSEngine.cs @@ -39,7 +39,7 @@ public async Task> SearchAndScrapeJobsAsync(string query) logger.LogError("SearchAsync returned null result."); break; } - else if (res.queries.request[0].count == 0) + else if (string.IsNullOrEmpty(res.queries.request[0].totalResults) || res.items == null) { logger.LogInformation($"No results found for query: {url}"); break; diff --git a/src/Backend/Program.cs b/src/Backend/Program.cs index 17b9aff..67a1a28 100644 --- a/src/Backend/Program.cs +++ b/src/Backend/Program.cs @@ -65,6 +65,7 @@ public static void Main(string[] args) builder.Services.AddSingleton(cosmosClient); builder.Services.AddSingleton(); builder.Services.AddSingleton(); + builder.Services.AddSingleton(); builder.Services.AddSingleton(); var app = builder.Build(); From f5a811a1492dee8164f93656a9fdf0a3e83b08bc Mon Sep 17 00:00:00 2001 From: devidnyk Date: Sun, 12 Oct 2025 12:50:05 +0530 Subject: [PATCH 3/7] search query on previous days --- src/Backend/Controllers/JobSearchController.cs | 6 ++++-- src/Backend/Operations/GSEngine.cs | 6 +++--- 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/src/Backend/Controllers/JobSearchController.cs b/src/Backend/Controllers/JobSearchController.cs index c38a0de..707a73c 100644 --- a/src/Backend/Controllers/JobSearchController.cs +++ b/src/Backend/Controllers/JobSearchController.cs @@ -14,10 +14,12 @@ public JobSearchController(AppContext appContext) [HttpGet] [Route("/jobs/search")] - public async Task>> SearchJobs([FromQuery(Name = "q")] string query) + public async Task>> SearchJobs( + [FromQuery(Name = "q")] string query, + [FromQuery(Name = "d")] int nPreviousDays) { var gsEngine = this.appContext.gsEngine; - var result = await gsEngine.SearchAndScrapeJobsAsync(query); + var result = await gsEngine.SearchAndScrapeJobsAsync(query, nPreviousDays); if (result != null) { var levels = await this.appContext.aiEngine.GetJobLevelAsync(result); diff --git a/src/Backend/Operations/GSEngine.cs 
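
The run loop above polls with no upper bound, so a stuck agent run would block the request indefinitely. A bounded variant using the same client surface from the patch (the two-minute budget is an assumption):

    // Same names as GetResponseInternalAsync; only the deadline is new.
    var deadline = DateTime.UtcNow + TimeSpan.FromMinutes(2); // assumed time budget
    do
    {
        await Task.Delay(TimeSpan.FromMilliseconds(500));
        run = agentsClient.Runs.GetRun(thread.Id, run.Id);
    }
    while ((run.Status == RunStatus.Queued || run.Status == RunStatus.InProgress)
           && DateTime.UtcNow < deadline);
    // anything still non-terminal past the deadline should be treated as a failure
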
b/src/Backend/Operations/GSEngine.cs index 68c4d4b..1bb28e9 100644 --- a/src/Backend/Operations/GSEngine.cs +++ b/src/Backend/Operations/GSEngine.cs @@ -21,14 +21,14 @@ public GSEngine(IConfiguration configuration, ILogger _logger) this.httpClient = new HttpClient(); } - public async Task> SearchAndScrapeJobsAsync(string query) + public async Task> SearchAndScrapeJobsAsync(string query, int nPreviousDays = 1) { var allJobs = new List(); int startIndex = 1, totalResults = 0; var template = $"{this.baseUrl}?key={apiKey}&cx={searchEngineId}&q={Uri.EscapeDataString(query)}"; - template += AddLocationToQuery() + AddDateRestrictionToQuery() + AddNegativeTermToQuery() + - AddPositiveTermToQuery() + RemoveSiteSearchFromQuery() + AddAdditionalSearchTerms(); + template += AddLocationToQuery() + AddDateRestrictionToQuery($"d{nPreviousDays}") + AddNegativeTermToQuery() + + AddPositiveTermToQuery() + AddAdditionalSearchTerms(); // + RemoveSiteSearchFromQuery() do { From dd1427d6450066a558dac4b2e707aa26442965be Mon Sep 17 00:00:00 2001 From: devidnyk Date: Sat, 18 Oct 2025 01:45:49 +0530 Subject: [PATCH 4/7] merged master and resolved conflicts --- src/Backend/AppContext.cs | 23 ------------------- .../Controllers/JobSearchController.cs | 19 +++++++++------ src/Backend/Controllers/ProblemsController.cs | 2 +- src/Backend/Operations/AIEngine.cs | 2 +- src/Backend/Operations/GSEngine.cs | 8 +++---- src/Backend/Program.cs | 3 +++ src/Common/Models/GSResult.cs | 2 +- src/Common/Models/{ => Public}/QueryParam.cs | 2 +- .../Internal => Common/Models}/ScrappedJob.cs | 4 ++-- 9 files changed, 24 insertions(+), 41 deletions(-) delete mode 100644 src/Backend/AppContext.cs rename src/Common/Models/{ => Public}/QueryParam.cs (89%) rename src/{Backend/Models/Internal => Common/Models}/ScrappedJob.cs (92%) diff --git a/src/Backend/AppContext.cs b/src/Backend/AppContext.cs deleted file mode 100644 index 6f3ddfa..0000000 --- a/src/Backend/AppContext.cs +++ /dev/null @@ -1,23 +0,0 @@ -using Backend.Operations; -using Microsoft.Azure.Cosmos; - -namespace Backend -{ - public class AppContext - { - public readonly DataProvider dataProvider; - public readonly IConfiguration configuration; - public readonly ILogger logger; - public readonly GSEngine gsEngine; - public readonly AIEngine aiEngine; - - public AppContext(DataProvider _dataProvider, GSEngine _gsEngine, AIEngine _aiEngine, IConfiguration configuration, ILogger logger) - { - this.dataProvider = _dataProvider; - this.gsEngine = _gsEngine; - this.configuration = configuration; - this.logger = logger; - this.aiEngine = _aiEngine; - } - } -} \ No newline at end of file diff --git a/src/Backend/Controllers/JobSearchController.cs b/src/Backend/Controllers/JobSearchController.cs index 707a73c..e73d3f9 100644 --- a/src/Backend/Controllers/JobSearchController.cs +++ b/src/Backend/Controllers/JobSearchController.cs @@ -1,28 +1,33 @@ namespace Backend.Controllers { + using Backend.Operations; using Microsoft.AspNetCore.Mvc; + using Common.Models; [ApiController] [Route("api")] public class JobSearchController : ControllerBase { - private readonly AppContext appContext; - public JobSearchController(AppContext appContext) + private readonly GSEngine gsEngine; + private readonly AIEngine aiEngine; + private readonly ILogger logger; + public JobSearchController(GSEngine gsEngine, AIEngine aiEngine, ILogger logger) { - this.appContext = appContext; + this.gsEngine = gsEngine; + this.aiEngine = aiEngine; + this.logger = logger; } [HttpGet] [Route("/jobs/search")] - public async 
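
The d{n} string handed to AddDateRestrictionToQuery follows the API's dateRestrict grammar, which also accepts week, month, and year windows:

    // dateRestrict values understood by the Custom Search API:
    //   d{n} = past n days, w{n} = past n weeks, m{n} = past n months, y{n} = past n years
    string DateRestrict(int days) => $"&dateRestrict=d{days}"; // the form built from the ?d= parameter
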
Task>> SearchJobs( + public async Task>> SearchJobs( [FromQuery(Name = "q")] string query, [FromQuery(Name = "d")] int nPreviousDays) { - var gsEngine = this.appContext.gsEngine; - var result = await gsEngine.SearchAndScrapeJobsAsync(query, nPreviousDays); + var result = await this.gsEngine.SearchAndScrapeJobsAsync(query, nPreviousDays); if (result != null) { - var levels = await this.appContext.aiEngine.GetJobLevelAsync(result); + var levels = await this.aiEngine.GetJobLevelAsync(result); foreach (var level in levels) { var job = result.FirstOrDefault(j => j.jobId == level.Key); diff --git a/src/Backend/Controllers/ProblemsController.cs b/src/Backend/Controllers/ProblemsController.cs index 7be6d7c..fd63bf0 100644 --- a/src/Backend/Controllers/ProblemsController.cs +++ b/src/Backend/Controllers/ProblemsController.cs @@ -1,7 +1,7 @@ namespace Backend.Controllers { using Backend.Filters; - using Backend.Models.Public; + using Common.Models.Public; using Backend.Operations; using Common.Models; using Microsoft.AspNetCore.Mvc; diff --git a/src/Backend/Operations/AIEngine.cs b/src/Backend/Operations/AIEngine.cs index 538c783..dfcb015 100644 --- a/src/Backend/Operations/AIEngine.cs +++ b/src/Backend/Operations/AIEngine.cs @@ -7,7 +7,7 @@ namespace Backend.Operations using Azure.AI.Projects; using Azure.AI.Agents.Persistent; using System.Diagnostics; - using Backend.Models.Internal; + using Common.Models; using Newtonsoft.Json; public class AIEngine diff --git a/src/Backend/Operations/GSEngine.cs b/src/Backend/Operations/GSEngine.cs index 1bb28e9..5acc8d1 100644 --- a/src/Backend/Operations/GSEngine.cs +++ b/src/Backend/Operations/GSEngine.cs @@ -1,8 +1,6 @@ namespace Backend.Operations { - using System.Data.Common; - using Backend.Models.Internal; - using Microsoft.Azure.Cosmos.Linq; + using Common.Models; using Newtonsoft.Json; public class GSEngine { @@ -61,14 +59,14 @@ public async Task> SearchAndScrapeJobsAsync(string query, int return allJobs; } - public async Task SearchRawUrlAsync(string url) + public async Task SearchRawUrlAsync(string url) { try { var response = await httpClient.GetAsync(url); response.EnsureSuccessStatusCode(); var content = await response.Content.ReadAsStringAsync(); - return JsonConvert.DeserializeObject(content); + return JsonConvert.DeserializeObject(content); } catch (Exception ex) { diff --git a/src/Backend/Program.cs b/src/Backend/Program.cs index 3c19749..43c7f8b 100644 --- a/src/Backend/Program.cs +++ b/src/Backend/Program.cs @@ -94,6 +94,9 @@ public static void Main(string[] args) services.AddTransient(); #endregion + services.AddSingleton(); + services.AddSingleton(); + var app = builder.Build(); ILogger logger = app.Logger; diff --git a/src/Common/Models/GSResult.cs b/src/Common/Models/GSResult.cs index e89800e..c1882c0 100644 --- a/src/Common/Models/GSResult.cs +++ b/src/Common/Models/GSResult.cs @@ -1,4 +1,4 @@ -namespace Backend.Models.Public +namespace Common.Models { public class GSResult { diff --git a/src/Common/Models/QueryParam.cs b/src/Common/Models/Public/QueryParam.cs similarity index 89% rename from src/Common/Models/QueryParam.cs rename to src/Common/Models/Public/QueryParam.cs index 340cc74..85010b8 100644 --- a/src/Common/Models/QueryParam.cs +++ b/src/Common/Models/Public/QueryParam.cs @@ -1,4 +1,4 @@ -namespace Backend.Models.Public +namespace Common.Models.Public { public class QueryParam { diff --git a/src/Backend/Models/Internal/ScrappedJob.cs b/src/Common/Models/ScrappedJob.cs similarity index 92% rename from 
src/Backend/Models/Internal/ScrappedJob.cs rename to src/Common/Models/ScrappedJob.cs index 9759d83..21caa68 100644 --- a/src/Backend/Models/Internal/ScrappedJob.cs +++ b/src/Common/Models/ScrappedJob.cs @@ -1,4 +1,4 @@ -namespace Backend.Models.Internal +namespace Common.Models { public class ScrappedJob { @@ -12,7 +12,7 @@ public class ScrappedJob public List tags { get; set; } = new List(); public ScrappedJob() { } - public ScrappedJob(Models.Public.Item item, DateTime scrappedTime) + public ScrappedJob(Item item, DateTime scrappedTime) { this.title = item.title; this.displayLink = item.displayLink; From 07581e4d80d9edf005a69d09b4c4e045f9d54db8 Mon Sep 17 00:00:00 2001 From: devidnyk Date: Sat, 18 Oct 2025 17:58:41 +0530 Subject: [PATCH 5/7] Add scrapper configurations and isolated scrapping process --- .../Controllers/JobSearchController.cs | 66 ++++++++++++--- src/Backend/Operations/AIEngine.cs | 4 +- src/Backend/Operations/GSEngine.cs | 47 ++++++++--- src/Backend/Operations/JobScrapper.cs | 64 ++++++++++++++ src/Backend/Operations/JobScrapperManager.cs | 53 ++++++++++++ .../Operations/JobScrapperSettingsManager.cs | 54 ++++++++++++ src/Backend/Program.cs | 3 + src/Common/Enums/CosmosContainerEnum.cs | 4 +- .../Factories/CosmosContainerFactory.cs | 16 +++- src/Common/Helper.cs | 51 ++++++++++++ src/Common/Models/JobScrapperSettings.cs | 83 +++++++++++++++++++ src/Common/Models/Public/ScrapperSettings.cs | 23 +++++ src/Common/Models/ScrappedJob.cs | 14 +--- src/Common/Repositories/JobsRepository.cs | 83 +++++++++++++++++++ 14 files changed, 525 insertions(+), 40 deletions(-) create mode 100644 src/Backend/Operations/JobScrapper.cs create mode 100644 src/Backend/Operations/JobScrapperManager.cs create mode 100644 src/Backend/Operations/JobScrapperSettingsManager.cs create mode 100644 src/Common/Helper.cs create mode 100644 src/Common/Models/JobScrapperSettings.cs create mode 100644 src/Common/Models/Public/ScrapperSettings.cs create mode 100644 src/Common/Repositories/JobsRepository.cs diff --git a/src/Backend/Controllers/JobSearchController.cs b/src/Backend/Controllers/JobSearchController.cs index e73d3f9..47e329d 100644 --- a/src/Backend/Controllers/JobSearchController.cs +++ b/src/Backend/Controllers/JobSearchController.cs @@ -3,6 +3,9 @@ namespace Backend.Controllers using Backend.Operations; using Microsoft.AspNetCore.Mvc; using Common.Models; + using Common.Models.Public; + using Common.Repositories; + using System.Threading.Tasks; [ApiController] [Route("api")] @@ -10,27 +13,31 @@ public class JobSearchController : ControllerBase { private readonly GSEngine gsEngine; private readonly AIEngine aiEngine; + private readonly JobScrapperManager jobscrapperManager; + private readonly JobsRepository jobsContainer; private readonly ILogger logger; - public JobSearchController(GSEngine gsEngine, AIEngine aiEngine, ILogger logger) + public JobSearchController(GSEngine gsEngine, AIEngine aiEngine, JobsRepository jobsContainer, JobScrapperManager jobscrapperManager, ILogger logger) { this.gsEngine = gsEngine; this.aiEngine = aiEngine; this.logger = logger; + this.jobscrapperManager = jobscrapperManager; + this.jobsContainer = jobsContainer; } [HttpGet] - [Route("/jobs/search")] + [Route("jobs/search")] public async Task>> SearchJobs( [FromQuery(Name = "q")] string query, [FromQuery(Name = "d")] int nPreviousDays) { - var result = await this.gsEngine.SearchAndScrapeJobsAsync(query, nPreviousDays); + var result = await this.gsEngine.SearchBasicQueryAsync(query, nPreviousDays); if (result != 
null) { var levels = await this.aiEngine.GetJobLevelAsync(result); foreach (var level in levels) { - var job = result.FirstOrDefault(j => j.jobId == level.Key); + var job = result.FirstOrDefault(j => j.id == level.Key); if (job != null) { job.tags.Add(level.Value); @@ -42,20 +49,55 @@ public async Task>> SearchJobs( } [HttpGet] - [Route("/jobs")] - public ActionResult GetLatestJobs() + [Route("jobs/latest")] + public async Task> GetLatestJobsFromScrapper() { - // Placeholder implementation for latest jobs - return Ok("Latest job postings"); + return Ok(await this.jobsContainer.GetAllLatestJobsAsync()); } [HttpGet] - [Route("/jobs/{id}")] - public ActionResult GetJobById(string id) + [Route("jobs/profile/{id}")] + public async Task> GetJobById(string id) { - // Placeholder implementation for getting job by ID - return Ok($"Job details for ID: {id}"); + var job = await this.jobsContainer.GetJobByIdAsync(id); + if (job != null) + { + return Ok(job); + } + return Ok("Not found."); + } + + [HttpGet] + [Route("jobs/scrappers")] + public ActionResult> GetAllJobScrappers() + { + // Placeholder implementation for getting all scrappers + return Ok(this.jobscrapperManager.settingsManager.GetAllSettings()); + } + + [HttpPut] + [Route("jobs/scrappers/{id}")] + public ActionResult CreateOrUpdateJobScrapperSettings(string id, [FromBody] ScrapperSettings settings) + { + // Placeholder implementation for updating scrapper settings + return Ok(this.jobscrapperManager.settingsManager.CreateOrUpdateSettings(id, settings)); + } + + [HttpGet] + [Route("jobs/scrappers/{id}")] + public ActionResult GetJobScrapperSettings(string id) + { + // Placeholder implementation for getting scrapper settings + return Ok(this.jobscrapperManager.settingsManager.GetSettingsById(id)); } + [HttpGet] + [Route("jobs/scrappers/{id}/trigger")] + public ActionResult TriggerScrapper(string id) + { + // Placeholder implementation for getting scrapper settings + this.jobscrapperManager.RunScrapperByIdAsync(id); + return Ok($"Started scrapper for settings id: {id}"); + } } } \ No newline at end of file diff --git a/src/Backend/Operations/AIEngine.cs b/src/Backend/Operations/AIEngine.cs index dfcb015..631336c 100644 --- a/src/Backend/Operations/AIEngine.cs +++ b/src/Backend/Operations/AIEngine.cs @@ -63,13 +63,13 @@ public async Task>> GetJobLevelAsync(List j.jobId))} | response: {response}"); + this.logger.LogInformation($"Processed jobs: {string.Join(",", batch.Select(j => j.id))} | response: {response}"); var kvList = response.Split(",").Select(kvs => kvs.Split(":")).Where(kv => kv.Length == 2).Select(kv => new KeyValuePair(kv[0].Trim(), kv[1].Trim())).ToList(); results.AddRange(kvList); } catch (Exception ex) { - this.logger.LogError($"Error processing batch: {string.Join(",", batch.Select(j => j.jobId))} | {ex.Message}"); + this.logger.LogError($"Error processing batch: {string.Join(",", batch.Select(j => j.id))} | {ex.Message}"); } } return results; diff --git a/src/Backend/Operations/GSEngine.cs b/src/Backend/Operations/GSEngine.cs index 5acc8d1..5d59743 100644 --- a/src/Backend/Operations/GSEngine.cs +++ b/src/Backend/Operations/GSEngine.cs @@ -1,6 +1,8 @@ namespace Backend.Operations { using Common.Models; + using Common.Models.Public; + using Microsoft.AspNetCore.Mvc.ModelBinding; using Newtonsoft.Json; public class GSEngine { @@ -19,14 +21,21 @@ public GSEngine(IConfiguration configuration, ILogger _logger) this.httpClient = new HttpClient(); } - public async Task> SearchAndScrapeJobsAsync(string query, int nPreviousDays = 1) + 
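
The PUT endpoint binds its body to the public ScrapperSettings model introduced later in this patch. A sketch of creating a scrapper profile (the id "daily-swe" and all values are made up):

    using var client = new HttpClient { BaseAddress = new Uri("http://localhost:5164") };
    var payload = new ScrapperSettings
    {
        enabled = true,
        runIntervalInHours = 24,
        settings = new QuerySettings
        {
            query = "software engineer jobs",
            exactTerms = "Software Engineer",
            negativeTerms = "Manager",
            location = "India",
            siteToExclude = "linkedin.com",
            additionalTerms = "India"
        }
    };
    await client.PutAsJsonAsync("api/jobs/scrappers/daily-swe", payload); // System.Net.Http.Json
    // GET api/jobs/scrappers/daily-swe/trigger then starts a run

As written, the create path in the settings manager stores a new profile disabled regardless of the payload (only the update path copies enabled), so a second PUT is needed before trigger will actually run it.
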
public async Task> SearchQueryAsync(JobScrapperSettings settings) { + var qsettings = settings.Settings; var allJobs = new List(); int startIndex = 1, totalResults = 0; - var template = $"{this.baseUrl}?key={apiKey}&cx={searchEngineId}&q={Uri.EscapeDataString(query)}"; - template += AddLocationToQuery() + AddDateRestrictionToQuery($"d{nPreviousDays}") + AddNegativeTermToQuery() + - AddPositiveTermToQuery() + AddAdditionalSearchTerms(); // + RemoveSiteSearchFromQuery() + var template = $"{this.baseUrl}?key={apiKey}&cx={searchEngineId}&q={Uri.EscapeDataString(qsettings.Query)}"; + template += AddDateRestrictionToQuery(qsettings.lookBackDays); + + if (!string.IsNullOrEmpty(qsettings.ExactTerms)) template += AddExactTermsToQuery(qsettings.ExactTerms); + if (!string.IsNullOrEmpty(qsettings.NegativeTerms)) template += AddNegativeTermToQuery(qsettings.NegativeTerms); + if (!string.IsNullOrEmpty(qsettings.Location)) template += AddClientLocationToQuery(qsettings.Location); + if (!string.IsNullOrEmpty(qsettings.SiteToInclude)) template += AddSiteSearchToQuery(qsettings.SiteToExclude); + if (!string.IsNullOrEmpty(qsettings.SiteToExclude)) template += AddExcludeSiteSearchFromQuery(qsettings.SiteToExclude); + if (!string.IsNullOrEmpty(qsettings.AdditionalSearchterms)) template += AddAdditionalSearchTerms(qsettings.AdditionalSearchterms); do { @@ -42,7 +51,7 @@ public async Task> SearchAndScrapeJobsAsync(string query, int logger.LogInformation($"No results found for query: {url}"); break; } - + foreach (var item in res.items) { var job = new ScrappedJob(item, DateTime.UtcNow); @@ -50,7 +59,7 @@ public async Task> SearchAndScrapeJobsAsync(string query, int } totalResults = int.Parse(res.queries.request[0].totalResults); - startIndex += res.queries.request[0].count; + startIndex += res.queries.request[0].count; } while (startIndex < maxResultsPerSearch && startIndex < totalResults); @@ -59,6 +68,22 @@ public async Task> SearchAndScrapeJobsAsync(string query, int return allJobs; } + public async Task> SearchBasicQueryAsync(string query, int nPreviousDays = 1) + { + var qsettings = new Common.Models.Public.QuerySettings + { + query = query, + additionalTerms = "India", + exactTerms = "Software Engineer", + negativeTerms = "Manager", + location = "India", + siteToExclude = "linkedin.com" + }; + var settings = new JobScrapperSettings("basic-search", qsettings, true); + settings.Settings.lookBackDays = nPreviousDays; + return await SearchQueryAsync(settings); + } + public async Task SearchRawUrlAsync(string url) { try @@ -76,14 +101,14 @@ public async Task> SearchAndScrapeJobsAsync(string query, int return null; } - private string AddLocationToQuery(string location = "in") + private string AddClientLocationToQuery(string location = "in") { return $"&gl={location}"; } - private string AddDateRestrictionToQuery(string dateRestrict = "d1") + private string AddDateRestrictionToQuery(int previousNDays = 1) { - return $"&dateRestrict={dateRestrict}"; + return $"&dateRestrict=d{previousNDays}"; } private string AddNegativeTermToQuery(string phrase = "manager") @@ -91,7 +116,7 @@ private string AddNegativeTermToQuery(string phrase = "manager") return $"&excludeTerms={Uri.EscapeDataString(phrase)}"; } - private string AddPositiveTermToQuery(string phrase = "Software Engineer") + private string AddExactTermsToQuery(string phrase = "Software Engineer") { return $"&exactTerms={Uri.EscapeDataString(phrase)}"; } @@ -101,7 +126,7 @@ private string AddSiteSearchToQuery(string site = "linkedin.com") return 
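
One detail to watch in SearchQueryAsync above: the include branch passes qsettings.SiteToExclude into AddSiteSearchToQuery, so an include-only profile filters on the wrong site. The apparent intent, for comparison (sketch, same helper names):

    if (!string.IsNullOrEmpty(qsettings.SiteToInclude))
        template += AddSiteSearchToQuery(qsettings.SiteToInclude);          // siteSearchFilter=i
    if (!string.IsNullOrEmpty(qsettings.SiteToExclude))
        template += AddExcludeSiteSearchFromQuery(qsettings.SiteToExclude); // siteSearchFilter=e
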
$"&siteSearch={site}&siteSearchFilter=i"; } - private string RemoveSiteSearchFromQuery(string site = "linkedin.com") + private string AddExcludeSiteSearchFromQuery(string site = "linkedin.com") { return $"&siteSearch={site}&siteSearchFilter=e"; } diff --git a/src/Backend/Operations/JobScrapper.cs b/src/Backend/Operations/JobScrapper.cs new file mode 100644 index 0000000..349d9e7 --- /dev/null +++ b/src/Backend/Operations/JobScrapper.cs @@ -0,0 +1,64 @@ +namespace Backend.Operations +{ + using Common.Models; + using Common.Repositories; + + public class JobScrapper + { + private JobScrapperSettings settings; + private GSEngine gsEngine; + private AIEngine aiEngine; + private JobsRepository jobsContainer; + private ILogger logger; + + public JobScrapper(JobScrapperSettings settings, GSEngine gsEngine, AIEngine aiEngine, JobsRepository jobsRepo, ILogger logger) + { + this.logger = logger; + this.gsEngine = gsEngine; + this.aiEngine = aiEngine; + this.settings = settings; + this.jobsContainer = jobsRepo; + } + + public async Task RunAsync() + { + var startTime = DateTime.UtcNow; + this.logger.LogInformation($"Starting JobScrapper run for settings: {this.settings}"); + + var searchResults = await gsEngine.SearchQueryAsync(this.settings); + + if (searchResults == null || searchResults.Count == 0) + { + this.logger.LogInformation($"Nothing to process. Query settings: {this.settings}"); + return; + } + + var mp = searchResults.ToDictionary(j => j.id, j => j); + var levels = await this.aiEngine.GetJobLevelAsync(searchResults); + foreach (var level in levels) + { + if (mp.ContainsKey(level.Key)) + { + mp[level.Key].tags.Add(level.Value); + } + else + { + this.logger.LogWarning($"Job ID {level.Key} not found in search results while assigning level tag."); + } + } + + foreach (var job in searchResults) + { + var success = await this.jobsContainer.CreateOrUpdateJobAsync(job); + if (!success) + { + this.logger.LogError($"Failed to push job {job.id} to JobsRepository."); + } + } + + var duration = DateTime.UtcNow - startTime; + this.logger.LogInformation($"JobScrapper run completed. Duration: {duration}. 
Processed {searchResults.Count} jobs for settings: {this.settings}"); + } + + } +} \ No newline at end of file diff --git a/src/Backend/Operations/JobScrapperManager.cs b/src/Backend/Operations/JobScrapperManager.cs new file mode 100644 index 0000000..432e148 --- /dev/null +++ b/src/Backend/Operations/JobScrapperManager.cs @@ -0,0 +1,53 @@ +using Common.Repositories; + +namespace Backend.Operations +{ + public class JobScrapperManager + { + private readonly ILogger logger; + private readonly GSEngine gsEngine; + private readonly AIEngine aiEngine; + private readonly JobsRepository jobsContainer; + public readonly JobScrapperSettingsManager settingsManager; + + + public JobScrapperManager(ILogger logger, GSEngine gsEngine, AIEngine aiEngine, JobScrapperSettingsManager settingsManager, JobsRepository jobsRepo) + { + this.logger = logger; + this.gsEngine = gsEngine; + this.aiEngine = aiEngine; + this.settingsManager = settingsManager; + this.jobsContainer = jobsRepo; + } + + public async Task RunAllScrappersAsync() + { + + } + + public async Task RunScrapperByIdAsync(string id) + { + var settings = this.settingsManager.GetSettingsById(id); + if (settings.Enabled) + { + var scrapper = new JobScrapper(settings, this.gsEngine, this.aiEngine, this.jobsContainer, this.logger); + Task.Run(async () => + { + try + { + await scrapper.RunAsync(); + } + catch (Exception ex) + { + this.logger.LogError($"Error occurred while running scrapper with ID {id}: {ex.Message}"); + } + }); + this.settingsManager.UpdateLastRunTime(id, DateTime.UtcNow); + } + else + { + this.logger.LogWarning($"Scrapper with ID {id} is disabled. Skipping execution."); + } + } + } +} \ No newline at end of file diff --git a/src/Backend/Operations/JobScrapperSettingsManager.cs b/src/Backend/Operations/JobScrapperSettingsManager.cs new file mode 100644 index 0000000..4771902 --- /dev/null +++ b/src/Backend/Operations/JobScrapperSettingsManager.cs @@ -0,0 +1,54 @@ +namespace Backend.Operations +{ + using System.Collections.Concurrent; + using System.Globalization; + using System.Reflection.Metadata.Ecma335; + using Common.Models; + + public class JobScrapperSettingsManager + { + private ConcurrentDictionary settingsStore = new ConcurrentDictionary(StringComparer.OrdinalIgnoreCase); + + public JobScrapperSettingsManager() {} + + public JobScrapperSettings CreateOrUpdateSettings(string id, Common.Models.Public.ScrapperSettings publicSettings) + { + var newSettings = new JobScrapperSettings( + id, + publicSettings.settings, + false); // Initially disabled + + settingsStore.AddOrUpdate(id, newSettings, (key, value) => + { + value.UpdateFromPublicModel(publicSettings); + value.LastUpdated = DateTime.UtcNow; + return value; + }); + + return settingsStore[id]; + } + + public JobScrapperSettings GetSettingsById(string id) + { + if(settingsStore.TryGetValue(id, out var settings)) + { + return settings; + } + return new JobScrapperSettings("NOT FOUND", new Common.Models.Public.QuerySettings(), false); + } + + public List GetAllSettings() + { + return settingsStore.Values.ToList(); + } + + public void UpdateLastRunTime(string id, DateTime runTime) + { + if(settingsStore.TryGetValue(id, out var settings)) + { + settings.LastRunTime = runTime; + settingsStore[id] = settings; + } + } + } +} \ No newline at end of file diff --git a/src/Backend/Program.cs b/src/Backend/Program.cs index 43c7f8b..5bace22 100644 --- a/src/Backend/Program.cs +++ b/src/Backend/Program.cs @@ -96,6 +96,9 @@ public static void Main(string[] args) services.AddSingleton(); 
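
RunScrapperByIdAsync stamps LastRunTime at scheduling time and discards the Task.Run result. A variant that records the time only after a successful run (same names as the patch; the discard assignment makes the fire-and-forget explicit):

    _ = Task.Run(async () =>
    {
        try
        {
            await scrapper.RunAsync();
            this.settingsManager.UpdateLastRunTime(id, DateTime.UtcNow); // stamp on completion
        }
        catch (Exception ex)
        {
            this.logger.LogError($"Error occurred while running scrapper with ID {id}: {ex.Message}");
        }
    });
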
services.AddSingleton(); + services.AddSingleton(); + services.AddSingleton(); + services.AddSingleton(); var app = builder.Build(); ILogger logger = app.Logger; diff --git a/src/Common/Enums/CosmosContainerEnum.cs b/src/Common/Enums/CosmosContainerEnum.cs index 538f048..cf53d10 100644 --- a/src/Common/Enums/CosmosContainerEnum.cs +++ b/src/Common/Enums/CosmosContainerEnum.cs @@ -8,6 +8,8 @@ namespace Common.Enums { public enum CosmosContainerEnum { - ProblemsContainer + ProblemsContainer, + JobsContainer, + ScrapperSettingsContainer } } diff --git a/src/Common/Factories/CosmosContainerFactory.cs b/src/Common/Factories/CosmosContainerFactory.cs index d526140..2253602 100644 --- a/src/Common/Factories/CosmosContainerFactory.cs +++ b/src/Common/Factories/CosmosContainerFactory.cs @@ -21,16 +21,24 @@ public CosmosContainerFactory(CosmosClient cosmosClient, IConfiguration configur public Container GetContainer(CosmosContainerEnum container) { var containerDetails = LoadContainerDetails(); + string dbId; + string containerId; switch (container) { case CosmosContainerEnum.ProblemsContainer: - var dbId = containerDetails[container].DatabaseName; - var containerId = containerDetails[container].ContainerName; - var db = _cosmosClient.GetDatabase(dbId); - return db.GetContainer(containerId); + dbId = containerDetails[container].DatabaseName; + containerId = containerDetails[container].ContainerName; + break; + case CosmosContainerEnum.JobsContainer: + dbId = "JobDataBase"; + containerId = "JobDetailsContainer"; + break; default: throw new ArgumentOutOfRangeException(nameof(container), container, null); } + + var db = _cosmosClient.GetDatabase(dbId); + return db.GetContainer(containerId); } private Dictionary LoadContainerDetails() diff --git a/src/Common/Helper.cs b/src/Common/Helper.cs new file mode 100644 index 0000000..435650e --- /dev/null +++ b/src/Common/Helper.cs @@ -0,0 +1,51 @@ +namespace Common.Helper +{ + using System; + using System.Collections.Generic; + using System.Text; + + public static class FastHashId + { + public static string GenerateHashId(string part1, string part2, string part3) + { + return GenerateHashId(new List { part1, part2, part3 }); + } + + public static string GenerateHashId(List input) + { + string combined = string.Join("|", input); + ulong hash = 14695981039346656037UL; // FNV offset basis + const ulong prime = 1099511628211UL; + + foreach (byte b in Encoding.UTF8.GetBytes(combined)) + { + hash ^= b; + hash *= prime; + } + + // Convert hash to Base36 (alphanumeric) for compactness + string base36 = ToBase36(hash); + + // Ensure it's exactly 10 characters + return base36.Length > 10 ? 
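
Because FNV-1a is deterministic over the concatenated inputs, re-scraping the same posting reproduces the same id, which is what lets the Cosmos upsert in JobsRepository deduplicate across runs. Usage sketch (values illustrative):

    string id = FastHashId.GenerateHashId(
        "https://example.com/careers/123", // link
        "Software Engineer II",            // title
        "example.com");                    // displayLink
    // always the same fixed-width 10-character Base36 string for these inputs
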
base36.Substring(0, 10) : base36.PadLeft(10, '0'); + } + + private static string ToBase36(ulong value) + { + const string chars = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ"; + StringBuilder sb = new StringBuilder(); + while (value > 0) + { + sb.Insert(0, chars[(int)(value % 36)]); + value /= 36; + } + return sb.ToString(); + } + + public static string GenerateRandonGuid() + { + return Guid.NewGuid().ToString("N"); + } + } + +} diff --git a/src/Common/Models/JobScrapperSettings.cs b/src/Common/Models/JobScrapperSettings.cs new file mode 100644 index 0000000..d97b7a4 --- /dev/null +++ b/src/Common/Models/JobScrapperSettings.cs @@ -0,0 +1,83 @@ +namespace Common.Models +{ + public class JobScrapperSettings + { + public string Id { get; set; } + public bool Enabled { get; set; } + public DateTime LastUpdated { get; set; } + public DateTime LastRunTime { get; set; } + public int RunIntervalInHours { get; set; } + public QuerySettings Settings { get; set; } + + public JobScrapperSettings(string id, Models.Public.QuerySettings settings, bool enabled = false) + { + this.Id = id; + this.Enabled = enabled; + this.LastUpdated = DateTime.UtcNow; + this.LastRunTime = DateTime.MinValue; + this.RunIntervalInHours = 24; // Default to daily runs + this.Settings = new Models.QuerySettings(settings); + } + + public string GetQueryParameters() + { + return string.Empty; + } + + public void UpdateFromPublicModel(Models.Public.ScrapperSettings publicSettings) + { + this.Enabled = publicSettings.enabled; + this.RunIntervalInHours = publicSettings.runIntervalInHours; + this.Settings = new Models.QuerySettings(publicSettings.settings); + } + + public Models.Public.ScrapperSettings ToPublicModel() + { + return new Models.Public.ScrapperSettings + { + id = this.Id, + enabled = this.Enabled, + lastUpdated = this.LastUpdated, + lastRunTime = this.LastRunTime, + runIntervalInHours = this.RunIntervalInHours, + settings = new Models.Public.QuerySettings + { + query = this.Settings.Query, + location = this.Settings.Location, + siteToInclude = this.Settings.SiteToInclude, + siteToExclude = this.Settings.SiteToExclude, + exactTerms = this.Settings.ExactTerms, + negativeTerms = this.Settings.NegativeTerms + } + }; + } + + public override string ToString() + { + return $"JobScrapperSettings(Id={Id}, Enabled={Enabled}, LastUpdated={LastUpdated}, LastRunTime={LastRunTime}, RunIntervalInHours={RunIntervalInHours}, Settings=[Query={Settings.Query}, Location={Settings.Location}])"; + } + } + + public class QuerySettings + { + public string Query { get; set; } + public string Location { get; set; } + public string SiteToInclude { get; set; } + public string SiteToExclude { get; set; } + public string ExactTerms { get; set; } + public string NegativeTerms { get; set; } + public int lookBackDays = 1; + public string AdditionalSearchterms { get; set; } + + public QuerySettings(Models.Public.QuerySettings qs) + { + this.Query = qs.query; + this.Location = qs.location; + this.SiteToInclude = qs.siteToInclude; + this.SiteToExclude = qs.siteToExclude; + this.ExactTerms = qs.exactTerms; + this.NegativeTerms = qs.negativeTerms; + this.AdditionalSearchterms = qs.additionalTerms; + } + } +} \ No newline at end of file diff --git a/src/Common/Models/Public/ScrapperSettings.cs b/src/Common/Models/Public/ScrapperSettings.cs new file mode 100644 index 0000000..1a6a320 --- /dev/null +++ b/src/Common/Models/Public/ScrapperSettings.cs @@ -0,0 +1,23 @@ +namespace Common.Models.Public +{ + public class ScrapperSettings + { + public string id { get; set; } 
+ public bool enabled { get; set; } + public DateTime lastUpdated { get; set; } + public DateTime lastRunTime { get; set; } + public int runIntervalInHours { get; set; } + public QuerySettings settings { get; set; } + } + + public class QuerySettings + { + public string query { get; set; } + public string location { get; set; } + public string siteToInclude { get; set; } + public string siteToExclude { get; set; } + public string exactTerms { get; set; } + public string negativeTerms { get; set; } + public string additionalTerms { get; set; } + } +} \ No newline at end of file diff --git a/src/Common/Models/ScrappedJob.cs b/src/Common/Models/ScrappedJob.cs index 21caa68..2758490 100644 --- a/src/Common/Models/ScrappedJob.cs +++ b/src/Common/Models/ScrappedJob.cs @@ -2,7 +2,7 @@ namespace Common.Models { public class ScrappedJob { - public string jobId { get; set; } + public string id { get; set; } public string title { get; set; } public string displayLink { get; set; } public string snippet { get; set; } @@ -18,20 +18,14 @@ public ScrappedJob(Item item, DateTime scrappedTime) this.displayLink = item.displayLink; this.snippet = item.snippet; this.link = item.link; - this.jobId = GenerateHashId(item.link, item.displayLink); + this.id = GenerateHashId(item.link, item.title, item.displayLink); this.scrappedTime = scrappedTime; this.description = "NA"; } - private string GenerateHashId(string url, string displayLink) + private string GenerateHashId(string v1, string v2, string v3) { - // Use a simple hash code and base36 encoding for lightweight hash - int hash = url.GetHashCode(); - string base36 = Math.Abs(hash).ToString("x"); // Hexadecimal representation - string dtime = DateTime.UtcNow.ToString("yyyyMMdd"); - // Pad or trim to 20 characters - var hashvalue = base36.Length > 10 ? 
base36.Substring(0, 10) : base36.PadLeft(10, '0'); - return $"{displayLink}-{dtime}-{hashvalue}"; + return Common.Helper.FastHashId.GenerateHashId(v1, v2, v3); } } } \ No newline at end of file diff --git a/src/Common/Repositories/JobsRepository.cs b/src/Common/Repositories/JobsRepository.cs new file mode 100644 index 0000000..f95d4b8 --- /dev/null +++ b/src/Common/Repositories/JobsRepository.cs @@ -0,0 +1,83 @@ +namespace Common.Repositories +{ + using Common.Enums; + using Common.Factories; + using Common.Models; + using Microsoft.Azure.Cosmos; + using Microsoft.Extensions.Logging; + + public class JobsRepository + { + private readonly Container jobsContainer; + private readonly ILogger logger; + + public JobsRepository(ICosmosContainerFactory cosmosContainerFactory, + ILogger logger) + { + this.jobsContainer = cosmosContainerFactory.GetContainer(CosmosContainerEnum.JobsContainer); + this.logger = logger; + } + + public async Task> GetAllLatestJobsAsync() + { + var query = "SELECT * FROM c ORDER BY c.scrappedTime DESC OFFSET 0 LIMIT 100"; + return await QueryJobsAsync(query); + } + + public async Task> GetAllJobsInLastOneDay() + { + var query = $"SELECT * FROM c WHERE DateTimeToTimestamp(GetCurrentTimestamp()) - DateTimeToTimestamp(c.scrappedTime) < 86400"; + return await QueryJobsAsync(query); + } + + public async Task GetJobByIdAsync(string id) + { + try + { + // TODO: NOT working as expected + var response = await this.jobsContainer.ReadItemAsync(id, new PartitionKey(id)); + return response.Resource; + } + catch (CosmosException cosmosEx) when (cosmosEx.StatusCode == System.Net.HttpStatusCode.NotFound) + { + this.logger.LogWarning($"Job: {id} not found in container."); + return null; + } + catch (Exception ex) + { + this.logger.LogError($"Failed to retrieve job: {id} from container. Ex: {ex}"); + return null; + } + } + + public async Task CreateOrUpdateJobAsync(ScrappedJob job) + { + try + { + // TODO: Do async inserts for faster performance + var res = await this.jobsContainer.UpsertItemAsync(job); + } + catch (Exception ex) + { + this.logger.LogError($"Failed to push job: {job.id} to container. Ex: {ex}"); + return false; + } + + return true; + } + + private async Task> QueryJobsAsync(string query) + { + var queryDefinition = new QueryDefinition(query); + var queryResultSetIterator = jobsContainer.GetItemQueryIterator(queryDefinition); + List results = new List(); + while (queryResultSetIterator.HasMoreResults) + { + var response = await queryResultSetIterator.ReadNextAsync(); + results.AddRange(response); + } + this.logger.LogInformation($"Retrieved {results.Count} jobs from Cosmos DB. 
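
The TODO on ReadItemAsync is the classic symptom of a partition key mismatch: a point read succeeds only when the supplied PartitionKey equals the item's stored partition key value, so new PartitionKey(id) works only if the container is partitioned on /id. If it is partitioned on another path, a parameterized query is a safe fallback (sketch using the same SDK types; v3 queries are cross-partition by default):

    async Task<ScrappedJob?> GetJobByQueryAsync(Container jobsContainer, string id)
    {
        var q = new QueryDefinition("SELECT * FROM c WHERE c.id = @id")
            .WithParameter("@id", id);
        var iterator = jobsContainer.GetItemQueryIterator<ScrappedJob>(q);
        while (iterator.HasMoreResults)
            foreach (var job in await iterator.ReadNextAsync())
                return job; // first (and only) match
        return null;
    }
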
Query: {query}"); + return results; + } + } +} From b733e456d75695347752c43f54d9a21989bf6e05 Mon Sep 17 00:00:00 2001 From: Ayush Behera Date: Sun, 19 Oct 2025 22:39:58 +0530 Subject: [PATCH 6/7] Refactoring code for async data pulls --- src/.dockerignore | 30 ++ .../Controllers/JobSearchController.cs | 85 ++---- .../Controllers/ScrapperSettingsController.cs | 58 ++++ src/Backend/Filters/IFilter.cs | 9 - src/Backend/Filters/ProblemFilter.cs | 3 +- src/Backend/Operations/DataProvider.cs | 3 +- src/Backend/Operations/GSEngine.cs | 149 ---------- src/Backend/Operations/JobScrapperManager.cs | 53 ---- .../Operations/JobScrapperSettingsManager.cs | 54 ---- src/Backend/Program.cs | 9 +- src/Backend/appsettings.json | 5 +- src/Common/Common.csproj | 3 + .../Constants/ConfigurationConstants.cs | 3 + .../DatabaseModels/JobScrapperSettings.cs | 82 ++++++ .../ProblemSchema.cs | 36 +-- src/Common/DatabaseModels/QuerySettings.cs | 33 +++ .../{Models => DatabaseModels}/ScrappedJob.cs | 22 +- .../Operations => Common/Engines}/AIEngine.cs | 81 +++--- src/Common/Engines/GSEngine.cs | 238 ++++++++++++++++ .../Factories/CosmosContainerFactory.cs | 40 +-- src/Common/IFilter.cs | 7 + .../Managers}/JobScrapper.cs | 19 +- .../Managers/JobScrapperSettingsManager.cs | 93 ++++++ src/Common/Models/JobScrapperSettings.cs | 83 ------ src/Common/Models/Problem.cs | 2 + src/Common/Models/Public/QuerySettings.cs | 15 + src/Common/Models/Public/ScrapperSettings.cs | 14 +- src/Common/Queries/JobQuery.cs | 12 + .../JobScrapperSettingsRepository.cs | 53 ++++ src/Common/Repositories/JobsRepository.cs | 131 ++++++++- src/Common/Repositories/ProblemRepository.cs | 3 +- src/PetProjectAzFunctions/.gitignore | 264 ++++++++++++++++++ src/PetProjectAzFunctions/Dockerfile | 29 ++ .../JobOpeningsSyncFunction.cs | 67 +++++ .../PetProjectAzFunctions.csproj | 37 +++ src/PetProjectAzFunctions/Program.cs | 53 ++++ .../Properties/launchSettings.json | 15 + .../Properties/serviceDependencies.json | 11 + src/PetProjectAzFunctions/host.json | 12 + src/PetProjectAzFunctions/readme.md | 11 + src/Synchronizer/ProblemsProcessor.cs | 1 + src/lcw.sln | 20 +- 42 files changed, 1418 insertions(+), 530 deletions(-) create mode 100644 src/.dockerignore create mode 100644 src/Backend/Controllers/ScrapperSettingsController.cs delete mode 100644 src/Backend/Filters/IFilter.cs delete mode 100644 src/Backend/Operations/GSEngine.cs delete mode 100644 src/Backend/Operations/JobScrapperManager.cs delete mode 100644 src/Backend/Operations/JobScrapperSettingsManager.cs create mode 100644 src/Common/DatabaseModels/JobScrapperSettings.cs rename src/Common/{Models => DatabaseModels}/ProblemSchema.cs (55%) create mode 100644 src/Common/DatabaseModels/QuerySettings.cs rename src/Common/{Models => DatabaseModels}/ScrappedJob.cs (54%) rename src/{Backend/Operations => Common/Engines}/AIEngine.cs (63%) create mode 100644 src/Common/Engines/GSEngine.cs create mode 100644 src/Common/IFilter.cs rename src/{Backend/Operations => Common/Managers}/JobScrapper.cs (79%) create mode 100644 src/Common/Managers/JobScrapperSettingsManager.cs delete mode 100644 src/Common/Models/JobScrapperSettings.cs create mode 100644 src/Common/Models/Public/QuerySettings.cs create mode 100644 src/Common/Queries/JobQuery.cs create mode 100644 src/Common/Repositories/JobScrapperSettingsRepository.cs create mode 100644 src/PetProjectAzFunctions/.gitignore create mode 100644 src/PetProjectAzFunctions/Dockerfile create mode 100644 src/PetProjectAzFunctions/JobOpeningsSyncFunction.cs create mode 100644 
src/PetProjectAzFunctions/PetProjectAzFunctions.csproj create mode 100644 src/PetProjectAzFunctions/Program.cs create mode 100644 src/PetProjectAzFunctions/Properties/launchSettings.json create mode 100644 src/PetProjectAzFunctions/Properties/serviceDependencies.json create mode 100644 src/PetProjectAzFunctions/host.json create mode 100644 src/PetProjectAzFunctions/readme.md diff --git a/src/.dockerignore b/src/.dockerignore new file mode 100644 index 0000000..fe1152b --- /dev/null +++ b/src/.dockerignore @@ -0,0 +1,30 @@ +**/.classpath +**/.dockerignore +**/.env +**/.git +**/.gitignore +**/.project +**/.settings +**/.toolstarget +**/.vs +**/.vscode +**/*.*proj.user +**/*.dbmdl +**/*.jfm +**/azds.yaml +**/bin +**/charts +**/docker-compose* +**/Dockerfile* +**/node_modules +**/npm-debug.log +**/obj +**/secrets.dev.yaml +**/values.dev.yaml +LICENSE +README.md +!**/.gitignore +!.git/HEAD +!.git/config +!.git/packed-refs +!.git/refs/heads/** \ No newline at end of file diff --git a/src/Backend/Controllers/JobSearchController.cs b/src/Backend/Controllers/JobSearchController.cs index 47e329d..2ced804 100644 --- a/src/Backend/Controllers/JobSearchController.cs +++ b/src/Backend/Controllers/JobSearchController.cs @@ -2,102 +2,57 @@ namespace Backend.Controllers { using Backend.Operations; using Microsoft.AspNetCore.Mvc; - using Common.Models; using Common.Models.Public; using Common.Repositories; using System.Threading.Tasks; + using Common.Managers; + using Common.Engines; + using Common.Queries; + using Common.DatabaseModels; [ApiController] [Route("api")] public class JobSearchController : ControllerBase { - private readonly GSEngine gsEngine; - private readonly AIEngine aiEngine; - private readonly JobScrapperManager jobscrapperManager; - private readonly JobsRepository jobsContainer; + private readonly JobsRepository jobsRepository; private readonly ILogger logger; - public JobSearchController(GSEngine gsEngine, AIEngine aiEngine, JobsRepository jobsContainer, JobScrapperManager jobscrapperManager, ILogger logger) + public JobSearchController(JobsRepository jobsRepository, ILogger logger) { - this.gsEngine = gsEngine; - this.aiEngine = aiEngine; this.logger = logger; - this.jobscrapperManager = jobscrapperManager; - this.jobsContainer = jobsContainer; + this.jobsRepository = jobsRepository; } - [HttpGet] + [HttpPost] [Route("jobs/search")] - public async Task>> SearchJobs( - [FromQuery(Name = "q")] string query, - [FromQuery(Name = "d")] int nPreviousDays) + public async Task>> SearchJobs([FromBody] JobQuery jobquery) { - var result = await this.gsEngine.SearchBasicQueryAsync(query, nPreviousDays); - if (result != null) - { - var levels = await this.aiEngine.GetJobLevelAsync(result); - foreach (var level in levels) - { - var job = result.FirstOrDefault(j => j.id == level.Key); - if (job != null) - { - job.tags.Add(level.Value); - } - } - return Ok(result); - } - return StatusCode(500, "Error occurred while searching for jobs."); + return Ok(await jobsRepository.GetJobsFromQuery(jobquery)); } [HttpGet] [Route("jobs/latest")] - public async Task> GetLatestJobsFromScrapper() + public async Task> GetLatestJobsFromDb() { - return Ok(await this.jobsContainer.GetAllLatestJobsAsync()); + return Ok(await this.jobsRepository.GetAllLatestJobsAsync()); + } + + [HttpGet] + [Route("jobs/lastOneDay")] + public async Task> GetLastOneDayJobsFromDb() + { + return Ok(await this.jobsRepository.GetAllJobsInLastOneDay()); } [HttpGet] [Route("jobs/profile/{id}")] public async Task> GetJobById(string id) { - var job 
= await this.jobsContainer.GetJobByIdAsync(id);
+ var job = await this.jobsRepository.GetJobByIdAsync(id);
if (job != null)
{
return Ok(job);
}
return Ok("Not found.");
}
-
- [HttpGet]
- [Route("jobs/scrappers")]
- public ActionResult<List<JobScrapperSettings>> GetAllJobScrappers()
- {
- // Placeholder implementation for getting all scrappers
- return Ok(this.jobscrapperManager.settingsManager.GetAllSettings());
- }
-
- [HttpPut]
- [Route("jobs/scrappers/{id}")]
- public ActionResult CreateOrUpdateJobScrapperSettings(string id, [FromBody] ScrapperSettings settings)
- {
- // Placeholder implementation for updating scrapper settings
- return Ok(this.jobscrapperManager.settingsManager.CreateOrUpdateSettings(id, settings));
- }
-
- [HttpGet]
- [Route("jobs/scrappers/{id}")]
- public ActionResult GetJobScrapperSettings(string id)
- {
- // Placeholder implementation for getting scrapper settings
- return Ok(this.jobscrapperManager.settingsManager.GetSettingsById(id));
- }
-
- [HttpGet]
- [Route("jobs/scrappers/{id}/trigger")]
- public ActionResult TriggerScrapper(string id)
- {
- // Placeholder implementation for getting scrapper settings
- this.jobscrapperManager.RunScrapperByIdAsync(id);
- return Ok($"Started scrapper for settings id: {id}");
- }
}
} \ No newline at end of file
diff --git a/src/Backend/Controllers/ScrapperSettingsController.cs b/src/Backend/Controllers/ScrapperSettingsController.cs
new file mode 100644
index 0000000..3cbdc4b
--- /dev/null
+++ b/src/Backend/Controllers/ScrapperSettingsController.cs
@@ -0,0 +1,58 @@
+using Common.DatabaseModels;
+using Common.Engines;
+using Common.Managers;
+using Common.Models;
+using Common.Models.Public;
+using Common.Repositories;
+using Microsoft.AspNetCore.Mvc;
+
+namespace Backend.Controllers
+{
+ [ApiController]
+ [Route("api/[controller]")]
+ public class ScrapperSettingsController : ControllerBase
+ {
+ private readonly JobScrapperSettingsManager _settingsManager;
+
+ private readonly ILogger<ScrapperSettingsController> _logger;
+
+ public ScrapperSettingsController(JobScrapperSettingsManager jobScrapperSettingsManager,
+ ILogger<ScrapperSettingsController> logger)
+ {
+ _settingsManager = jobScrapperSettingsManager;
+ _logger = logger;
+ }
+
+ [HttpGet]
+ [Route("jobs/scrappers")]
+ public async Task<ActionResult<List<JobScrapperSettings>>> GetAllJobScrappers()
+ {
+ // Returns every scrapper settings document from the settings container
+ return Ok(await _settingsManager.GetAllSettings());
+ }
+
+ [HttpPut]
+ [Route("jobs/scrappers/{id}")]
+ public async Task<ActionResult<JobScrapperSettings>> UpdateJobScrapperSettings(string id, [FromBody] ScrapperSettings settings)
+ {
+ // Update the scrapper settings stored under the given id
+ return Ok(await _settingsManager.CreateOrUpdateSettings(id, settings));
+ }
+
+ [HttpPost]
+ [Route("jobs/scrappers/Add")]
+ public async Task<ActionResult<JobScrapperSettings>> CreateNewJobScrapperSettings([FromBody] ScrapperSettings settings)
+ {
+ // Create a brand-new scrapper settings document; the empty id forces creation
+ return Ok(await _settingsManager.CreateOrUpdateSettings(string.Empty, settings));
+ }
+
+ [HttpGet]
+ [Route("jobs/scrappers/{id}")]
+ public async Task<ActionResult<JobScrapperSettings>> GetJobScrapperSettings(string id)
+ {
+ // Fetch a single scrapper settings document by id
+ return Ok(await _settingsManager.GetSettingsById(id));
+ }
+ }
+}
diff --git a/src/Backend/Filters/IFilter.cs b/src/Backend/Filters/IFilter.cs
deleted file mode 100644
index 91a29e5..0000000
--- a/src/Backend/Filters/IFilter.cs
+++ /dev/null
@@ -1,9 +0,0 @@
-using Common.Models;
-
-namespace Backend.Filters
-{
- public interface IFilter
- {
- public List<Problem> ApplyFilterAsync(List<Problem> problems);
- }
-} \ No newline at end of file
diff --git
a/src/Backend/Filters/ProblemFilter.cs b/src/Backend/Filters/ProblemFilter.cs index a4b66f5..4d73c5e 100644 --- a/src/Backend/Filters/ProblemFilter.cs +++ b/src/Backend/Filters/ProblemFilter.cs @@ -1,8 +1,9 @@ +using Common; using Common.Models; namespace Backend.Filters { - public class ProblemFilter : IFilter + public class ProblemFilter : IFilter { private int skip = 0; private int limit = 50; diff --git a/src/Backend/Operations/DataProvider.cs b/src/Backend/Operations/DataProvider.cs index c280dd1..04f80a0 100644 --- a/src/Backend/Operations/DataProvider.cs +++ b/src/Backend/Operations/DataProvider.cs @@ -1,6 +1,7 @@ namespace Backend.Operations { using Backend.Filters; + using Common; using Common.Cache; using Common.Constants; using Common.Models; @@ -15,7 +16,7 @@ public DataProvider([FromKeyedServices(CacheConstants.ProblemCacheKey)] ICache p _logger = logger; } - public async Task> GetProblemsAsync(IFilter? filter = null) + public async Task> GetProblemsAsync(IFilter? filter = null) { var allProblems = await GetAllProblemsAsync(); if (filter != null) diff --git a/src/Backend/Operations/GSEngine.cs b/src/Backend/Operations/GSEngine.cs deleted file mode 100644 index 5d59743..0000000 --- a/src/Backend/Operations/GSEngine.cs +++ /dev/null @@ -1,149 +0,0 @@ -namespace Backend.Operations -{ - using Common.Models; - using Common.Models.Public; - using Microsoft.AspNetCore.Mvc.ModelBinding; - using Newtonsoft.Json; - public class GSEngine - { - private readonly string apiKey; - private readonly string searchEngineId; - private readonly HttpClient httpClient; - private string baseUrl = "https://customsearch.googleapis.com/customsearch/v1"; - private int maxResultsPerSearch = 150; - ILogger logger; - - public GSEngine(IConfiguration configuration, ILogger _logger) - { - this.apiKey = configuration["GoogleSearch:ApiKey"] ?? throw new ArgumentNullException("Google Search API Key is not configured."); - this.searchEngineId = configuration["GoogleSearch:SearchEngineId"] ?? 
throw new ArgumentNullException("Google Search Engine ID is not configured."); - this.logger = _logger; - this.httpClient = new HttpClient(); - } - - public async Task> SearchQueryAsync(JobScrapperSettings settings) - { - var qsettings = settings.Settings; - var allJobs = new List(); - int startIndex = 1, totalResults = 0; - - var template = $"{this.baseUrl}?key={apiKey}&cx={searchEngineId}&q={Uri.EscapeDataString(qsettings.Query)}"; - template += AddDateRestrictionToQuery(qsettings.lookBackDays); - - if (!string.IsNullOrEmpty(qsettings.ExactTerms)) template += AddExactTermsToQuery(qsettings.ExactTerms); - if (!string.IsNullOrEmpty(qsettings.NegativeTerms)) template += AddNegativeTermToQuery(qsettings.NegativeTerms); - if (!string.IsNullOrEmpty(qsettings.Location)) template += AddClientLocationToQuery(qsettings.Location); - if (!string.IsNullOrEmpty(qsettings.SiteToInclude)) template += AddSiteSearchToQuery(qsettings.SiteToExclude); - if (!string.IsNullOrEmpty(qsettings.SiteToExclude)) template += AddExcludeSiteSearchFromQuery(qsettings.SiteToExclude); - if (!string.IsNullOrEmpty(qsettings.AdditionalSearchterms)) template += AddAdditionalSearchTerms(qsettings.AdditionalSearchterms); - - do - { - var url = template + AddStartIndexToQuery(startIndex); - var res = await SearchRawUrlAsync(url); - if (res == null) - { - logger.LogError("SearchAsync returned null result."); - break; - } - else if (string.IsNullOrEmpty(res.queries.request[0].totalResults) || res.items == null) - { - logger.LogInformation($"No results found for query: {url}"); - break; - } - - foreach (var item in res.items) - { - var job = new ScrappedJob(item, DateTime.UtcNow); - allJobs.Add(job); - } - - totalResults = int.Parse(res.queries.request[0].totalResults); - startIndex += res.queries.request[0].count; - } - while (startIndex < maxResultsPerSearch && startIndex < totalResults); - - this.logger.LogInformation($"Fetched {allJobs.Count} jobs. Total available: {totalResults}. 
Using url template: {template}"); - - return allJobs; - } - - public async Task> SearchBasicQueryAsync(string query, int nPreviousDays = 1) - { - var qsettings = new Common.Models.Public.QuerySettings - { - query = query, - additionalTerms = "India", - exactTerms = "Software Engineer", - negativeTerms = "Manager", - location = "India", - siteToExclude = "linkedin.com" - }; - var settings = new JobScrapperSettings("basic-search", qsettings, true); - settings.Settings.lookBackDays = nPreviousDays; - return await SearchQueryAsync(settings); - } - - public async Task SearchRawUrlAsync(string url) - { - try - { - var response = await httpClient.GetAsync(url); - response.EnsureSuccessStatusCode(); - var content = await response.Content.ReadAsStringAsync(); - return JsonConvert.DeserializeObject(content); - } - catch (Exception ex) - { - logger.LogError(ex, "Error occurred during Google Search API call."); - } - - return null; - } - - private string AddClientLocationToQuery(string location = "in") - { - return $"&gl={location}"; - } - - private string AddDateRestrictionToQuery(int previousNDays = 1) - { - return $"&dateRestrict=d{previousNDays}"; - } - - private string AddNegativeTermToQuery(string phrase = "manager") - { - return $"&excludeTerms={Uri.EscapeDataString(phrase)}"; - } - - private string AddExactTermsToQuery(string phrase = "Software Engineer") - { - return $"&exactTerms={Uri.EscapeDataString(phrase)}"; - } - - private string AddSiteSearchToQuery(string site = "linkedin.com") - { - return $"&siteSearch={site}&siteSearchFilter=i"; - } - - private string AddExcludeSiteSearchFromQuery(string site = "linkedin.com") - { - return $"&siteSearch={site}&siteSearchFilter=e"; - } - - private string AddSortingToQuery(string sort = "date") - { - return $"&sort={sort}"; - } - - private string AddAdditionalSearchTerms(string terms = "India") - { - return $"&hq={Uri.EscapeDataString(terms)}"; - } - - private string AddStartIndexToQuery(int startIndex = 1) - { - return $"&start={startIndex}"; - } - } -} \ No newline at end of file diff --git a/src/Backend/Operations/JobScrapperManager.cs b/src/Backend/Operations/JobScrapperManager.cs deleted file mode 100644 index 432e148..0000000 --- a/src/Backend/Operations/JobScrapperManager.cs +++ /dev/null @@ -1,53 +0,0 @@ -using Common.Repositories; - -namespace Backend.Operations -{ - public class JobScrapperManager - { - private readonly ILogger logger; - private readonly GSEngine gsEngine; - private readonly AIEngine aiEngine; - private readonly JobsRepository jobsContainer; - public readonly JobScrapperSettingsManager settingsManager; - - - public JobScrapperManager(ILogger logger, GSEngine gsEngine, AIEngine aiEngine, JobScrapperSettingsManager settingsManager, JobsRepository jobsRepo) - { - this.logger = logger; - this.gsEngine = gsEngine; - this.aiEngine = aiEngine; - this.settingsManager = settingsManager; - this.jobsContainer = jobsRepo; - } - - public async Task RunAllScrappersAsync() - { - - } - - public async Task RunScrapperByIdAsync(string id) - { - var settings = this.settingsManager.GetSettingsById(id); - if (settings.Enabled) - { - var scrapper = new JobScrapper(settings, this.gsEngine, this.aiEngine, this.jobsContainer, this.logger); - Task.Run(async () => - { - try - { - await scrapper.RunAsync(); - } - catch (Exception ex) - { - this.logger.LogError($"Error occurred while running scrapper with ID {id}: {ex.Message}"); - } - }); - this.settingsManager.UpdateLastRunTime(id, DateTime.UtcNow); - } - else - { - this.logger.LogWarning($"Scrapper 
with ID {id} is disabled. Skipping execution."); - } - } - } -} \ No newline at end of file diff --git a/src/Backend/Operations/JobScrapperSettingsManager.cs b/src/Backend/Operations/JobScrapperSettingsManager.cs deleted file mode 100644 index 4771902..0000000 --- a/src/Backend/Operations/JobScrapperSettingsManager.cs +++ /dev/null @@ -1,54 +0,0 @@ -namespace Backend.Operations -{ - using System.Collections.Concurrent; - using System.Globalization; - using System.Reflection.Metadata.Ecma335; - using Common.Models; - - public class JobScrapperSettingsManager - { - private ConcurrentDictionary settingsStore = new ConcurrentDictionary(StringComparer.OrdinalIgnoreCase); - - public JobScrapperSettingsManager() {} - - public JobScrapperSettings CreateOrUpdateSettings(string id, Common.Models.Public.ScrapperSettings publicSettings) - { - var newSettings = new JobScrapperSettings( - id, - publicSettings.settings, - false); // Initially disabled - - settingsStore.AddOrUpdate(id, newSettings, (key, value) => - { - value.UpdateFromPublicModel(publicSettings); - value.LastUpdated = DateTime.UtcNow; - return value; - }); - - return settingsStore[id]; - } - - public JobScrapperSettings GetSettingsById(string id) - { - if(settingsStore.TryGetValue(id, out var settings)) - { - return settings; - } - return new JobScrapperSettings("NOT FOUND", new Common.Models.Public.QuerySettings(), false); - } - - public List GetAllSettings() - { - return settingsStore.Values.ToList(); - } - - public void UpdateLastRunTime(string id, DateTime runTime) - { - if(settingsStore.TryGetValue(id, out var settings)) - { - settings.LastRunTime = runTime; - settingsStore[id] = settings; - } - } - } -} \ No newline at end of file diff --git a/src/Backend/Program.cs b/src/Backend/Program.cs index 5bace22..8d434c8 100644 --- a/src/Backend/Program.cs +++ b/src/Backend/Program.cs @@ -3,7 +3,9 @@ namespace Backend; using Backend.Operations; using Common.Cache; using Common.Constants; +using Common.Engines; using Common.Factories; +using Common.Managers; using Common.Repositories; using Microsoft.Azure.Cosmos; using Microsoft.Extensions.Logging.ApplicationInsights; @@ -59,12 +61,14 @@ public static void Main(string[] args) { // Learn more about configuring Swagger/OpenAPI at https://aka.ms/aspnetcore/swashbuckle builder.Services.AddEndpointsApiExplorer(); - builder.Services.AddSwaggerGen(); + builder.Services.AddSwaggerGen(c => + { + c.CustomSchemaIds(type => type.FullName); + }); builder.Logging.AddConsole(); } - // Register AppContext as singleton var config = builder.Configuration; #region Register Cosmos related services @@ -98,7 +102,6 @@ public static void Main(string[] args) services.AddSingleton(); services.AddSingleton(); services.AddSingleton(); - services.AddSingleton(); var app = builder.Build(); ILogger logger = app.Logger; diff --git a/src/Backend/appsettings.json b/src/Backend/appsettings.json index 0ea12c8..c1e8513 100644 --- a/src/Backend/appsettings.json +++ b/src/Backend/appsettings.json @@ -11,7 +11,10 @@ "CosmosDbUri": "https://lcw-cosmos.documents.azure.com:443/", "AccountKey": "", "LCProject:DatabaseName": "LeetCodeWrapper", - "LCProject:ContainerName": "Problems" + "LCProject:ContainerName": "Problems", + "JobProject:DatabaseName": "JobDataBase", + "JobProject:ContainerName": "JobDetailsContainer", + "JobProject:ScraperContainerName": "ScraperSettingsContainer" }, "ApplicationInsights": { "LogLevel": { diff --git a/src/Common/Common.csproj b/src/Common/Common.csproj index 844bdc5..91dccc4 100644 --- 
a/src/Common/Common.csproj +++ b/src/Common/Common.csproj @@ -13,6 +13,9 @@ + + + diff --git a/src/Common/Constants/ConfigurationConstants.cs b/src/Common/Constants/ConfigurationConstants.cs index bb21455..b286deb 100644 --- a/src/Common/Constants/ConfigurationConstants.cs +++ b/src/Common/Constants/ConfigurationConstants.cs @@ -14,6 +14,9 @@ public static class ConfigurationConstants public const string ApplicationSettings = "ApplicationSettings"; public const string LCProjectContainerNameKey = "LCProject:ContainerName"; public const string LCProjectDatabaseNameKey = "LCProject:DatabaseName"; + public const string JobsProjectContainerNameKey = "JobProject:ContainerName"; + public const string JobsScraperSettingsContainerNameKey = "JobProject:ScraperContainerName"; + public const string JobsProjectDatabaseNameKey = "JobProject:DatabaseName"; #endregion } } diff --git a/src/Common/DatabaseModels/JobScrapperSettings.cs b/src/Common/DatabaseModels/JobScrapperSettings.cs new file mode 100644 index 0000000..17f3c6d --- /dev/null +++ b/src/Common/DatabaseModels/JobScrapperSettings.cs @@ -0,0 +1,82 @@ +using System; +using Common.Models.Public; +using PublicSettingsModel = Common.Models.Public.QuerySettings; + +namespace Common.DatabaseModels +{ + public class JobScrapperSettings + { + public string id { get; set; } + + public string settingName { get; set; } + + public bool enabled { get; set; } + + public DateTime lastUpdated { get; set; } + + public DateTime lastRunTime { get; set; } + + public int runIntervalInMinutes { get; set; } + + public QuerySettings settings { get; set; } + + public JobScrapperSettings(string id, + string settingName, + int? runIntervalsInMinutes, + PublicSettingsModel settings, + bool enabled = false) + { + this.id = id; + this.settingName = settingName; + this.enabled = enabled; + this.lastUpdated = DateTime.UtcNow; + this.lastRunTime = DateTime.MinValue; + this.runIntervalInMinutes = Math.Min(60, runIntervalsInMinutes ?? 
60); + this.settings = new QuerySettings(settings); + } + + public void UpdateFromPublicModel(ScrapperSettings publicSettings) + { + if (publicSettings == null) throw new ArgumentNullException(nameof(publicSettings)); + + this.enabled = publicSettings.enabled; + this.runIntervalInMinutes = publicSettings.runIntervalInMinutes; + this.settings = new QuerySettings(publicSettings.settings); + this.lastUpdated = DateTime.UtcNow; + // keep SettingName unchanged unless public model provides one + if (!string.IsNullOrWhiteSpace(publicSettings.name)) + { + this.settingName = publicSettings.name; + } + } + + public ScrapperSettings ToPublicModel() + { + return new ScrapperSettings + { + id = this.id, + name = this.settingName, + enabled = this.enabled, + lastUpdated = this.lastUpdated, + lastRunTime = this.lastRunTime, + runIntervalInMinutes = this.runIntervalInMinutes, + settings = new PublicSettingsModel + { + query = this.settings.query, + locations = this.settings.locations, + sitesToInclude = this.settings.sitesToInclude, + sitesToExclude = this.settings.sitesToExclude, + exactTerms = this.settings.exactTerms, + negativeTerms = this.settings.negativeTerms, + additionalTerms = this.settings.additionalSearchterms, + lookBackDays = this.settings.lookBackDays + } + }; + } + + public QuerySettings GetQuerySettings() + { + return this.settings; + } + } +} \ No newline at end of file diff --git a/src/Common/Models/ProblemSchema.cs b/src/Common/DatabaseModels/ProblemSchema.cs similarity index 55% rename from src/Common/Models/ProblemSchema.cs rename to src/Common/DatabaseModels/ProblemSchema.cs index edbc20d..137bb81 100644 --- a/src/Common/Models/ProblemSchema.cs +++ b/src/Common/DatabaseModels/ProblemSchema.cs @@ -1,30 +1,32 @@ -namespace Common.Models +using Common.Models; + +namespace Common.DatabaseModels { public class ProblemSchema { public ProblemSchema() { } public ProblemSchema(ProblemSchema ps) { - this.id = ps.id; - this.title = ps.title; - this.url = ps.url; - this.difficulty = ps.difficulty; - this.acceptance = ps.acceptance; - this.frequency = ps.frequency; - this.companyList = new List>>(); - this.metadataList = new List>(); + id = ps.id; + title = ps.title; + url = ps.url; + difficulty = ps.difficulty; + acceptance = ps.acceptance; + frequency = ps.frequency; + companyList = new List>>(); + metadataList = new List>(); } public ProblemSchema(Problem p) { - this.id = p.id; - this.title = p.title; - this.url = p.url; - this.difficulty = p.difficulty; - this.acceptance = p.acceptance; - this.frequency = p.frequency; - this.companyList = p.companies.Select(kv => new KeyValuePair>(kv.Key, kv.Value.ToList())).ToList(); - this.metadataList = p.metadata.Select(kv => new KeyValuePair(kv.Key, kv.Value)).ToList(); + id = p.id; + title = p.title; + url = p.url; + difficulty = p.difficulty; + acceptance = p.acceptance; + frequency = p.frequency; + companyList = p.companies.Select(kv => new KeyValuePair>(kv.Key, kv.Value.ToList())).ToList(); + metadataList = p.metadata.Select(kv => new KeyValuePair(kv.Key, kv.Value)).ToList(); } public string id { get; set; } = string.Empty; diff --git a/src/Common/DatabaseModels/QuerySettings.cs b/src/Common/DatabaseModels/QuerySettings.cs new file mode 100644 index 0000000..c03571a --- /dev/null +++ b/src/Common/DatabaseModels/QuerySettings.cs @@ -0,0 +1,33 @@ +using System; +using System.Collections.Generic; +using System.Linq; +using System.Text; +using System.Threading.Tasks; +using PublicSettingsModel = Common.Models.Public.QuerySettings; + +namespace 
Common.DatabaseModels +{ + + public class QuerySettings + { + public string query { get; set; } + public List locations { get; set; } + public List sitesToInclude { get; set; } + public List sitesToExclude { get; set; } + public List exactTerms { get; set; } + public List negativeTerms { get; set; } + public int lookBackDays { get; set; } = 1; + public List additionalSearchterms { get; set; } + + public QuerySettings(PublicSettingsModel qs) + { + query = qs.query; + locations = qs.locations; + sitesToInclude = qs.sitesToInclude; + sitesToExclude = qs.sitesToExclude; + exactTerms = qs.exactTerms; + negativeTerms = qs.negativeTerms; + additionalSearchterms = qs.additionalTerms; + } + } +} diff --git a/src/Common/Models/ScrappedJob.cs b/src/Common/DatabaseModels/ScrappedJob.cs similarity index 54% rename from src/Common/Models/ScrappedJob.cs rename to src/Common/DatabaseModels/ScrappedJob.cs index 2758490..f334826 100644 --- a/src/Common/Models/ScrappedJob.cs +++ b/src/Common/DatabaseModels/ScrappedJob.cs @@ -1,4 +1,6 @@ -namespace Common.Models +using Common.Models; + +namespace Common.DatabaseModels { public class ScrappedJob { @@ -9,23 +11,27 @@ public class ScrappedJob public string description { get; set; } public string link { get; set; } public DateTime scrappedTime { get; set; } + public DateTime JobPostedTime { get; set; } + public string companyName { get; set; } + public string jobType { get; set; } + public string location { get; set; } public List tags { get; set; } = new List(); public ScrappedJob() { } public ScrappedJob(Item item, DateTime scrappedTime) { - this.title = item.title; - this.displayLink = item.displayLink; - this.snippet = item.snippet; - this.link = item.link; - this.id = GenerateHashId(item.link, item.title, item.displayLink); + title = item.title; + displayLink = item.displayLink; + snippet = item.snippet; + link = item.link; + id = GenerateHashId(item.link, item.title, item.displayLink); this.scrappedTime = scrappedTime; - this.description = "NA"; + description = "NA"; } private string GenerateHashId(string v1, string v2, string v3) { - return Common.Helper.FastHashId.GenerateHashId(v1, v2, v3); + return Helper.FastHashId.GenerateHashId(v1, v2, v3); } } } \ No newline at end of file diff --git a/src/Backend/Operations/AIEngine.cs b/src/Common/Engines/AIEngine.cs similarity index 63% rename from src/Backend/Operations/AIEngine.cs rename to src/Common/Engines/AIEngine.cs index 631336c..ee3fa22 100644 --- a/src/Backend/Operations/AIEngine.cs +++ b/src/Common/Engines/AIEngine.cs @@ -1,4 +1,4 @@ -namespace Backend.Operations +namespace Common.Engines { using Azure; using Azure.AI; @@ -7,8 +7,10 @@ namespace Backend.Operations using Azure.AI.Projects; using Azure.AI.Agents.Persistent; using System.Diagnostics; - using Common.Models; using Newtonsoft.Json; + using Microsoft.Extensions.Logging; + using Microsoft.Extensions.Configuration; + using Common.DatabaseModels; public class AIEngine { @@ -79,51 +81,52 @@ private async Task GetResponseInternalAsync(string input) { if (!IsReady()) { - this.logger.LogError($"AIEngine is not properly initialized. Given input: {input}"); - throw new InvalidOperationException("AIEngine is not properly initialized."); + logger.LogError($"AIEngine is not properly initialized. 
Input: {input}"); + throw new InvalidOperationException("AIEngine not initialized."); } - PersistentAgentThread thread = agentsClient.Threads.CreateThread(); - - PersistentThreadMessage messageResponse = agentsClient.Messages.CreateMessage( - thread.Id, - MessageRole.User, - input); + var threadResponse = await agentsClient.Threads.CreateThreadAsync(); + var thread = threadResponse.Value; - ThreadRun run = agentsClient.Runs.CreateRun( - thread.Id, - agent.Id); - - // Poll until the run reaches a terminal status - do - { - await Task.Delay(TimeSpan.FromMilliseconds(500)); - run = agentsClient.Runs.GetRun(thread.Id, run.Id); - } - while (run.Status == RunStatus.Queued - || run.Status == RunStatus.InProgress); - if (run.Status != RunStatus.Completed) + try { - this.logger.LogError($"Run failed or was canceled. ThreadId: {thread.Id} Last error: {run.LastError?.Message}"); - throw new InvalidOperationException($"Run failed or was canceled: {run.LastError?.Message}"); - } + await agentsClient.Messages.CreateMessageAsync(thread.Id, MessageRole.User, input); + var runResponse = await agentsClient.Runs.CreateRunAsync(thread.Id, agent.Id); + var run = runResponse.Value; - Pageable messages = agentsClient.Messages.GetMessages( - thread.Id, order: ListSortOrder.Ascending); - - string response = string.Empty; - PersistentThreadMessage lastThreadMessage = messages.Last(); + // Poll until terminal state + do + { + await Task.Delay(500); + run = await agentsClient.Runs.GetRunAsync(thread.Id, run.Id); + } + while (run.Status == RunStatus.Queued || run.Status == RunStatus.InProgress); - foreach (MessageContent contentItem in lastThreadMessage.ContentItems) - { - if (contentItem is MessageTextContent textItem) + if (run.Status != RunStatus.Completed) { - response += textItem.Text; + logger.LogError($"Run failed. 
ThreadId={thread.Id}, Error={run.LastError?.Message}");
+ throw new InvalidOperationException($"Run failed: {run.LastError?.Message}");
+ }
+
+ // Fetch all messages in ascending order
+ var messages = agentsClient.Messages.GetMessagesAsync(thread.Id, order: ListSortOrder.Ascending);
+
+ string response = string.Empty;
+ PersistentThreadMessage lastThreadMessage = messages.ToBlockingEnumerable().Last();
+ foreach (MessageContent contentItem in lastThreadMessage.ContentItems)
+ {
+ if (contentItem is MessageTextContent textItem)
+ {
+ response += textItem.Text;
+ }
}
- agentsClient.Threads.DeleteThread(thread.Id);
- return response;
+ return response;
+ }
+ finally
+ {
+ await agentsClient.Threads.DeleteThreadAsync(thread.Id);
+ }
}
- }
+ }
} \ No newline at end of file
diff --git a/src/Common/Engines/GSEngine.cs b/src/Common/Engines/GSEngine.cs
new file mode 100644
index 0000000..231c96b
--- /dev/null
+++ b/src/Common/Engines/GSEngine.cs
@@ -0,0 +1,238 @@
+using System;
+using System.Collections.Generic;
+using System.Linq;
+using System.Net.Http;
+using System.Text;
+using System.Threading.Tasks;
+using Common.DatabaseModels;
+using Common.Models;
+using Microsoft.Extensions.Configuration;
+using Microsoft.Extensions.Logging;
+using Newtonsoft.Json;
+
+namespace Common.Engines
+{
+ public class GSEngine
+ {
+ private readonly string apiKey;
+ private readonly string searchEngineId;
+ private readonly HttpClient httpClient;
+ private string baseUrl = "https://customsearch.googleapis.com/customsearch/v1";
+ private int maxResultsPerSearch = 150;
+ private readonly ILogger<GSEngine> logger;
+
+ public GSEngine(IConfiguration configuration, ILogger<GSEngine> _logger)
+ {
+ this.apiKey = configuration["GoogleSearch:ApiKey"] ?? throw new ArgumentNullException("Google Search API Key is not configured.");
+ this.searchEngineId = configuration["GoogleSearch:SearchEngineId"] ?? throw new ArgumentNullException("Google Search Engine ID is not configured.");
+ this.logger = _logger;
+ this.httpClient = new HttpClient();
+ }
+
+ public async Task<List<ScrappedJob>> SearchQueryAsync(JobScrapperSettings settings)
+ {
+ if (settings == null) throw new ArgumentNullException(nameof(settings));
+
+ var qsettings = settings.GetQuerySettings() ?? throw new InvalidOperationException("Query settings cannot be null.");
+ var allJobs = new List<ScrappedJob>();
+ int startIndex = 1;
+ int totalResults = 0;
+
+ var sb = new StringBuilder();
+ sb.Append($"{this.baseUrl}?key={apiKey}&cx={searchEngineId}");
+
+ // base query
+ var baseQuery = qsettings.query ?? string.Empty;
+ sb.Append($"&q={Uri.EscapeDataString(baseQuery)}");
+
+ // date restriction
+ if (qsettings.lookBackDays > 0)
+ {
+ sb.Append(AddDateRestrictionToQuery(qsettings.lookBackDays));
+ }
+
+ // Exact terms (join list if provided)
+ if (qsettings.exactTerms != null && qsettings.exactTerms.Any())
+ {
+ var exact = string.Join(" ", qsettings.exactTerms.Where(s => !string.IsNullOrWhiteSpace(s)));
+ if (!string.IsNullOrWhiteSpace(exact)) sb.Append(AddExactTermsToQuery(exact));
+ }
+
+ // Negative terms
+ if (qsettings.negativeTerms != null && qsettings.negativeTerms.Any())
+ {
+ var neg = string.Join(" ", qsettings.negativeTerms.Where(s => !string.IsNullOrWhiteSpace(s)));
+ if (!string.IsNullOrWhiteSpace(neg)) sb.Append(AddNegativeTermToQuery(neg));
+ }
+
+ // Location - use first location if present (api uses gl for country)
+ if (qsettings.locations != null && qsettings.locations.Any() && !string.IsNullOrWhiteSpace(qsettings.locations.First()))
+ {
+ sb.Append(AddClientLocationToQuery(qsettings.locations.First()));
+ }
+
+ // Site include / exclude - use first for siteSearch (API supports one siteSearch parameter);
+ // prefer include if present, otherwise exclude
+ if (qsettings.sitesToInclude != null && qsettings.sitesToInclude.Any() && !string.IsNullOrWhiteSpace(qsettings.sitesToInclude.First()))
+ {
+ sb.Append(AddSiteSearchToQuery(qsettings.sitesToInclude.First()));
+ }
+ else if (qsettings.sitesToExclude != null && qsettings.sitesToExclude.Any() && !string.IsNullOrWhiteSpace(qsettings.sitesToExclude.First()))
+ {
+ sb.Append(AddExcludeSiteSearchFromQuery(qsettings.sitesToExclude.First()));
+ }
+
+ // Additional terms (hq)
+ if (qsettings.additionalSearchterms != null && qsettings.additionalSearchterms.Any())
+ {
+ var add = string.Join(" ", qsettings.additionalSearchterms.Where(s => !string.IsNullOrWhiteSpace(s)));
+ if (!string.IsNullOrWhiteSpace(add)) sb.Append(AddAdditionalSearchTerms(add));
+ }
+
+ var template = sb.ToString();
+
+ do
+ {
+ var url = template + AddStartIndexToQuery(startIndex);
+ var res = await SearchRawUrlAsync(url);
+ if (res == null)
+ {
+ logger.LogError("SearchRawUrlAsync returned null for url: {url}", url);
+ break;
+ }
+
+ // No items => stop
+ if (res.items == null || res.items.Count == 0)
+ {
+ logger.LogInformation("No items returned for url: {url}", url);
+ break;
+ }
+
+ foreach (var item in res.items)
+ {
+ try
+ {
+ var job = new ScrappedJob(item, DateTime.UtcNow);
+ allJobs.Add(job);
+ }
+ catch (Exception ex)
+ {
+ logger.LogWarning(ex, "Skipping item due to processing error.");
+ }
+ }
+
+ // Determine total results
+ if (!string.IsNullOrWhiteSpace(res.searchInformation?.totalResults))
+ {
+ if (!int.TryParse(res.searchInformation.totalResults, out totalResults))
+ {
+ // try fallback to queries.request[0].totalResults
+ var reqTotal = res.queries?.request?.FirstOrDefault()?.totalResults;
+ if (!int.TryParse(reqTotal, out totalResults)) totalResults = int.MaxValue;
+ }
+ }
+ else
+ {
+ var reqTotal = res.queries?.request?.FirstOrDefault()?.totalResults;
+ if (!int.TryParse(reqTotal, out totalResults)) totalResults = int.MaxValue;
+ }
+
+ // Advance to next page if present
+ if (res.queries?.nextPage != null && res.queries.nextPage.Count > 0)
+ {
+ var next = res.queries.nextPage[0];
+ // Use next.startIndex if present; otherwise increment by count
+ if (next.startIndex > 0)
+ {
+ startIndex = next.startIndex;
+ }
+ else
+ {
+ var count = res.queries.request?.FirstOrDefault()?.count ?? res.items.Count;
+ if (count <= 0) break;
+ startIndex += count;
+ }
+ }
+ else
+ {
+ // no next page -> stop
+ break;
+ }
+
+ // safety: prevent infinite looping
+ if (startIndex <= 0 || startIndex > maxResultsPerSearch) break;
+ }
+ while (startIndex <= maxResultsPerSearch && (totalResults == 0 || startIndex <= totalResults));
+
+ this.logger.LogInformation("Fetched {count} jobs. Total available (approx): {total}. Url template: {template}", allJobs.Count, totalResults, template);
+ return allJobs;
+ }
+
+ public async Task<GSResult> SearchRawUrlAsync(string url)
+ {
+ try
+ {
+ var response = await httpClient.GetAsync(url);
+ if (!response.IsSuccessStatusCode)
+ {
+ logger.LogWarning("Google Search API returned status {status} for url {url}", response.StatusCode, url);
+ return null;
+ }
+
+ var content = await response.Content.ReadAsStringAsync();
+ return JsonConvert.DeserializeObject<GSResult>(content);
+ }
+ catch (Exception ex)
+ {
+ logger.LogError(ex, "Error occurred during Google Search API call.");
+ }
+
+ return null;
+ }
+
+ private string AddClientLocationToQuery(string location = "in")
+ {
+ return $"&gl={Uri.EscapeDataString(location)}";
+ }
+
+ private string AddDateRestrictionToQuery(int previousNDays = 1)
+ {
+ return $"&dateRestrict=d{previousNDays}";
+ }
+
+ private string AddNegativeTermToQuery(string phrase = "manager")
+ {
+ return $"&excludeTerms={Uri.EscapeDataString(phrase)}";
+ }
+
+ private string AddExactTermsToQuery(string phrase = "Software Engineer")
+ {
+ return $"&exactTerms={Uri.EscapeDataString(phrase)}";
+ }
+
+ private string AddSiteSearchToQuery(string site = "linkedin.com")
+ {
+ return $"&siteSearch={Uri.EscapeDataString(site)}&siteSearchFilter=i";
+ }
+
+ private string AddExcludeSiteSearchFromQuery(string site = "linkedin.com")
+ {
+ return $"&siteSearch={Uri.EscapeDataString(site)}&siteSearchFilter=e";
+ }
+
+ private string AddSortingToQuery(string sort = "date")
+ {
+ return $"&sort={Uri.EscapeDataString(sort)}";
+ }
+
+ private string AddAdditionalSearchTerms(string terms = "India")
+ {
+ return $"&hq={Uri.EscapeDataString(terms)}";
+ }
+
+ private string AddStartIndexToQuery(int startIndex = 1)
+ {
+ return $"&start={startIndex}";
+ }
+ }
+} \ No newline at end of file
diff --git a/src/Common/Factories/CosmosContainerFactory.cs b/src/Common/Factories/CosmosContainerFactory.cs
index 2253602..f54b062 100644
--- a/src/Common/Factories/CosmosContainerFactory.cs
+++ b/src/Common/Factories/CosmosContainerFactory.cs
@@ -3,6 +3,7 @@
using Common.Models.Miscellaneous;
using Microsoft.Azure.Cosmos;
using Microsoft.Extensions.Configuration;
+using Microsoft.Extensions.Logging;

namespace Common.Factories
{
@@ -12,33 +13,30 @@ public class CosmosContainerFactory : ICosmosContainerFactory

private readonly IConfiguration _configuration;

- public CosmosContainerFactory(CosmosClient cosmosClient, IConfiguration configuration)
+ private readonly ILogger<CosmosContainerFactory> _logger;
+ public CosmosContainerFactory(CosmosClient cosmosClient,
+ IConfiguration configuration,
+ ILogger<CosmosContainerFactory> logger)
{
_cosmosClient = cosmosClient;
_configuration = configuration;
+ _logger = logger;
}

public Container GetContainer(CosmosContainerEnum container)
{
var containerDetails = LoadContainerDetails();
- string dbId;
- string containerId;
- switch (container)
+
+ if(!containerDetails.ContainsKey(container))
{
- case CosmosContainerEnum.ProblemsContainer:
- dbId = containerDetails[container].DatabaseName;
- containerId = containerDetails[container].ContainerName;
- break;
- case CosmosContainerEnum.JobsContainer:
- dbId
= "JobDataBase"; - containerId = "JobDetailsContainer"; - break; - default: - throw new ArgumentOutOfRangeException(nameof(container), container, null); + _logger.LogError("Container details not found for container: {Container}", container); + throw new ArgumentOutOfRangeException(nameof(container), container, null); } - - var db = _cosmosClient.GetDatabase(dbId); - return db.GetContainer(containerId); + + var databaseName = containerDetails[container].DatabaseName; + var containerName = containerDetails[container].ContainerName; + var dbInstnace = _cosmosClient.GetDatabase(databaseName); + return dbInstnace.GetContainer(containerName); } private Dictionary LoadContainerDetails() @@ -49,6 +47,14 @@ private Dictionary LoadContainerDetails() { CosmosContainerEnum.ProblemsContainer, new ContainerDetails(config[ConfigurationConstants.LCProjectDatabaseNameKey], config[ConfigurationConstants.LCProjectContainerNameKey]) + }, + { + CosmosContainerEnum.JobsContainer, + new ContainerDetails(config[ConfigurationConstants.JobsProjectDatabaseNameKey], config[ConfigurationConstants.JobsProjectContainerNameKey]) + }, + { + CosmosContainerEnum.ScrapperSettingsContainer, + new ContainerDetails(config[ConfigurationConstants.JobsProjectDatabaseNameKey], config[ConfigurationConstants.JobsScraperSettingsContainerNameKey]) } }; } diff --git a/src/Common/IFilter.cs b/src/Common/IFilter.cs new file mode 100644 index 0000000..0a90e8c --- /dev/null +++ b/src/Common/IFilter.cs @@ -0,0 +1,7 @@ +namespace Common +{ + public interface IFilter + { + public List ApplyFilterAsync(List entities); + } +} \ No newline at end of file diff --git a/src/Backend/Operations/JobScrapper.cs b/src/Common/Managers/JobScrapper.cs similarity index 79% rename from src/Backend/Operations/JobScrapper.cs rename to src/Common/Managers/JobScrapper.cs index 349d9e7..f7f9a45 100644 --- a/src/Backend/Operations/JobScrapper.cs +++ b/src/Common/Managers/JobScrapper.cs @@ -1,23 +1,30 @@ -namespace Backend.Operations +namespace Common.Managers + { - using Common.Models; + using Common.DatabaseModels; + using Common.Engines; using Common.Repositories; + using Microsoft.Extensions.Logging; public class JobScrapper { private JobScrapperSettings settings; private GSEngine gsEngine; private AIEngine aiEngine; - private JobsRepository jobsContainer; + private JobsRepository jobsRepository; private ILogger logger; - public JobScrapper(JobScrapperSettings settings, GSEngine gsEngine, AIEngine aiEngine, JobsRepository jobsRepo, ILogger logger) + public JobScrapper(GSEngine gsEngine, AIEngine aiEngine, JobsRepository jobsRepo, ILogger logger) { this.logger = logger; this.gsEngine = gsEngine; this.aiEngine = aiEngine; + this.jobsRepository = jobsRepo; + } + + public void ConfigureSettings(JobScrapperSettings settings) + { this.settings = settings; - this.jobsContainer = jobsRepo; } public async Task RunAsync() @@ -49,7 +56,7 @@ public async Task RunAsync() foreach (var job in searchResults) { - var success = await this.jobsContainer.CreateOrUpdateJobAsync(job); + var success = await this.jobsRepository.CreateIfNotExistsAsync(job); if (!success) { this.logger.LogError($"Failed to push job {job.id} to JobsRepository."); diff --git a/src/Common/Managers/JobScrapperSettingsManager.cs b/src/Common/Managers/JobScrapperSettingsManager.cs new file mode 100644 index 0000000..3d434f7 --- /dev/null +++ b/src/Common/Managers/JobScrapperSettingsManager.cs @@ -0,0 +1,93 @@ +namespace Common.Managers +{ + using Common.DatabaseModels; + using Common.Enums; + using 
Common.Factories;
+ using Common.Models.Public;
+ using Microsoft.Azure.Cosmos;
+ using Microsoft.Extensions.Logging;
+ public class JobScrapperSettingsManager
+ {
+ private readonly Container _scrapperSettingsContainer;
+ private readonly ILogger<JobScrapperSettingsManager> _logger;
+
+ public JobScrapperSettingsManager(ICosmosContainerFactory cosmosContainerFactory,
+ ILogger<JobScrapperSettingsManager> logger)
+ {
+ _scrapperSettingsContainer = cosmosContainerFactory.GetContainer(CosmosContainerEnum.ScrapperSettingsContainer);
+ _logger = logger;
+ }
+
+ public async Task<JobScrapperSettings> CreateOrUpdateSettings(string id, ScrapperSettings publicSettings)
+ {
+ if(publicSettings == null)
+ {
+ throw new ArgumentNullException(nameof(publicSettings), "Public settings cannot be null");
+ }
+
+ // Parameterized lookup; interpolating the raw id into the SQL text would break the query.
+ var queryDef = new QueryDefinition("SELECT TOP 1 * FROM c WHERE c.id = @id").WithParameter("@id", id);
+ var settingsInDb = _scrapperSettingsContainer.GetItemQueryIterator<JobScrapperSettings>(queryDef);
+
+ var existingSettingsList = new List<JobScrapperSettings>();
+ var returnSettings = default(JobScrapperSettings);
+ while (settingsInDb.HasMoreResults)
+ {
+ var response = await settingsInDb.ReadNextAsync();
+ existingSettingsList.AddRange(response);
+ }
+
+ if(existingSettingsList.Count > 0)
+ {
+ var existingSettings = existingSettingsList[0];
+ existingSettings.UpdateFromPublicModel(publicSettings);
+ await _scrapperSettingsContainer.ReplaceItemAsync(
+ existingSettings,
+ existingSettings.id
+ );
+ returnSettings = existingSettings;
+ }
+ else
+ {
+ id = Guid.NewGuid().ToString();
+ var created = await _scrapperSettingsContainer.CreateItemAsync(
+ new JobScrapperSettings(
+ id,
+ publicSettings.name,
+ publicSettings.runIntervalInMinutes,
+ publicSettings.settings,
+ true)
+ );
+ returnSettings = created.Resource;
+ }
+
+ return returnSettings;
+ }
+
+ public async Task<JobScrapperSettings> GetSettingsById(string id)
+ {
+ try
+ {
+ var setting = await _scrapperSettingsContainer.ReadItemAsync<JobScrapperSettings>(
+ id,
+ new PartitionKey(id)
+ );
+ return setting.Resource;
+ }
+ catch (CosmosException ex) when (ex.StatusCode == System.Net.HttpStatusCode.NotFound)
+ {
+ // ReadItemAsync throws on a missing document rather than returning null.
+ _logger.LogError($"No JobScrapperSettings found with id: {id}");
+ throw new KeyNotFoundException($"No JobScrapperSettings found with id: {id}");
+ }
+ }
+
+ public async Task<List<JobScrapperSettings>> GetAllSettings()
+ {
+ var settingsInDb = _scrapperSettingsContainer.GetItemQueryIterator<JobScrapperSettings>("SELECT * FROM c");
+ var allSettings = new List<JobScrapperSettings>();
+ while (settingsInDb.HasMoreResults)
+ {
+ var response = await settingsInDb.ReadNextAsync();
+ allSettings.AddRange(response);
+ }
+ return allSettings;
+ }
+ }
+} \ No newline at end of file
diff --git a/src/Common/Models/JobScrapperSettings.cs b/src/Common/Models/JobScrapperSettings.cs
deleted file mode 100644
index d97b7a4..0000000
--- a/src/Common/Models/JobScrapperSettings.cs
+++ /dev/null
@@ -1,83 +0,0 @@
-namespace Common.Models
-{
- public class JobScrapperSettings
- {
- public string Id { get; set; }
- public bool Enabled { get; set; }
- public DateTime LastUpdated { get; set; }
- public DateTime LastRunTime { get; set; }
- public int RunIntervalInHours { get; set; }
- public QuerySettings Settings { get; set; }
-
- public JobScrapperSettings(string id, Models.Public.QuerySettings settings, bool enabled = false)
- {
- this.Id = id;
- this.Enabled = enabled;
- this.LastUpdated = DateTime.UtcNow;
- this.LastRunTime = DateTime.MinValue;
- this.RunIntervalInHours = 24; // Default to daily runs
- this.Settings = new Models.QuerySettings(settings);
- }
-
- public string GetQueryParameters()
- {
- return string.Empty;
- }
-
- public void UpdateFromPublicModel(Models.Public.ScrapperSettings publicSettings)
- {
- this.Enabled = publicSettings.enabled;
- this.RunIntervalInHours = publicSettings.runIntervalInHours;
- this.Settings = new Models.QuerySettings(publicSettings.settings); - } - - public Models.Public.ScrapperSettings ToPublicModel() - { - return new Models.Public.ScrapperSettings - { - id = this.Id, - enabled = this.Enabled, - lastUpdated = this.LastUpdated, - lastRunTime = this.LastRunTime, - runIntervalInHours = this.RunIntervalInHours, - settings = new Models.Public.QuerySettings - { - query = this.Settings.Query, - location = this.Settings.Location, - siteToInclude = this.Settings.SiteToInclude, - siteToExclude = this.Settings.SiteToExclude, - exactTerms = this.Settings.ExactTerms, - negativeTerms = this.Settings.NegativeTerms - } - }; - } - - public override string ToString() - { - return $"JobScrapperSettings(Id={Id}, Enabled={Enabled}, LastUpdated={LastUpdated}, LastRunTime={LastRunTime}, RunIntervalInHours={RunIntervalInHours}, Settings=[Query={Settings.Query}, Location={Settings.Location}])"; - } - } - - public class QuerySettings - { - public string Query { get; set; } - public string Location { get; set; } - public string SiteToInclude { get; set; } - public string SiteToExclude { get; set; } - public string ExactTerms { get; set; } - public string NegativeTerms { get; set; } - public int lookBackDays = 1; - public string AdditionalSearchterms { get; set; } - - public QuerySettings(Models.Public.QuerySettings qs) - { - this.Query = qs.query; - this.Location = qs.location; - this.SiteToInclude = qs.siteToInclude; - this.SiteToExclude = qs.siteToExclude; - this.ExactTerms = qs.exactTerms; - this.NegativeTerms = qs.negativeTerms; - this.AdditionalSearchterms = qs.additionalTerms; - } - } -} \ No newline at end of file diff --git a/src/Common/Models/Problem.cs b/src/Common/Models/Problem.cs index 27b6f19..714968f 100644 --- a/src/Common/Models/Problem.cs +++ b/src/Common/Models/Problem.cs @@ -1,3 +1,5 @@ +using Common.DatabaseModels; + namespace Common.Models { public enum Difficulty diff --git a/src/Common/Models/Public/QuerySettings.cs b/src/Common/Models/Public/QuerySettings.cs new file mode 100644 index 0000000..682021b --- /dev/null +++ b/src/Common/Models/Public/QuerySettings.cs @@ -0,0 +1,15 @@ +namespace Common.Models.Public +{ + + public class QuerySettings + { + public string query { get; set; } + public List locations { get; set; } + public List sitesToInclude { get; set; } + public List sitesToExclude { get; set; } + public List exactTerms { get; set; } + public List negativeTerms { get; set; } + public List additionalTerms { get; set; } + public int lookBackDays { get; set; } + } +} diff --git a/src/Common/Models/Public/ScrapperSettings.cs b/src/Common/Models/Public/ScrapperSettings.cs index 1a6a320..85d424b 100644 --- a/src/Common/Models/Public/ScrapperSettings.cs +++ b/src/Common/Models/Public/ScrapperSettings.cs @@ -3,21 +3,11 @@ namespace Common.Models.Public public class ScrapperSettings { public string id { get; set; } + public string name { get; set; } public bool enabled { get; set; } public DateTime lastUpdated { get; set; } public DateTime lastRunTime { get; set; } - public int runIntervalInHours { get; set; } + public int runIntervalInMinutes { get; set; } public QuerySettings settings { get; set; } } - - public class QuerySettings - { - public string query { get; set; } - public string location { get; set; } - public string siteToInclude { get; set; } - public string siteToExclude { get; set; } - public string exactTerms { get; set; } - public string negativeTerms { get; set; } - public string additionalTerms { get; set; } - } } \ No newline at end of file 
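For reference, a minimal sketch (not part of the patch) of how a caller might populate the reshaped public models when invoking the new ScrapperSettingsController endpoints; the property names follow the models above, and all values are illustrative only:

using System.Collections.Generic;
using Common.Models.Public;

var payload = new ScrapperSettings
{
    name = "daily-sde-search",                      // illustrative name
    enabled = true,
    runIntervalInMinutes = 60,
    settings = new QuerySettings
    {
        query = "software engineer jobs",           // illustrative query
        locations = new List<string> { "India" },
        sitesToExclude = new List<string> { "linkedin.com" },
        exactTerms = new List<string> { "Software Engineer" },
        negativeTerms = new List<string> { "Manager" },
        additionalTerms = new List<string> { "India" },
        lookBackDays = 1
    }
};
// Serialized as JSON, this body goes to PUT api/ScrapperSettings/jobs/scrappers/{id}
// to update an existing scrapper, or POST api/ScrapperSettings/jobs/scrappers/Add to
// create a new one; id, lastUpdated and lastRunTime are managed server-side.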
diff --git a/src/Common/Queries/JobQuery.cs b/src/Common/Queries/JobQuery.cs new file mode 100644 index 0000000..3bcce34 --- /dev/null +++ b/src/Common/Queries/JobQuery.cs @@ -0,0 +1,12 @@ +namespace Common.Queries +{ + public class JobQuery + { + public string JobType { get; set; } // Software Engineer, Data Scientist, etc. + public DateTime StartDate { get; set; } = DateTime.UtcNow; // Start date for the job posting + public DateTime EndDate { get; set; } = DateTime.UtcNow; // End date for the job posting + public List Companies { get; set; } // List of companies to filter + public List Locations { get; set; } // List of locations to filter + public string JobLevel { get; set; } // Entry Level, Mid Level, Senior Level, etc. + } +} diff --git a/src/Common/Repositories/JobScrapperSettingsRepository.cs b/src/Common/Repositories/JobScrapperSettingsRepository.cs new file mode 100644 index 0000000..8b2edb0 --- /dev/null +++ b/src/Common/Repositories/JobScrapperSettingsRepository.cs @@ -0,0 +1,53 @@ +using Common.DatabaseModels; +using Common.Enums; +using Common.Factories; +using Common.Managers; +using Microsoft.Azure.Cosmos; +using Microsoft.Extensions.Logging; +using System; +using System.Collections.Generic; +using System.Linq; +using System.Text; +using System.Threading.Tasks; + +namespace Common.Repositories +{ + public class JobScrapperSettingsRepository + { + private readonly Container _scrapperSettingsContainer; + private readonly ILogger _logger; + + public JobScrapperSettingsRepository(ICosmosContainerFactory cosmosContainerFactory, + ILogger logger) + { + _scrapperSettingsContainer = cosmosContainerFactory.GetContainer(CosmosContainerEnum.ScrapperSettingsContainer); + _logger = logger; + } + + public async Task> GetAllSettings() + { + var settingsInDb = _scrapperSettingsContainer.GetItemQueryIterator($"SELECT * from JobScrapperSettings"); + var allSettings = new List(); + while (settingsInDb.HasMoreResults) + { + var response = await settingsInDb.ReadNextAsync(); + allSettings.AddRange(response); + } + return allSettings; + } + + public async Task UpdateSettingsAsync(string id, JobScrapperSettings jobSetting) + { + try + { + await _scrapperSettingsContainer.UpsertItemAsync(jobSetting, new PartitionKey(id)); + _logger.LogInformation($"Successfully updated JobScrapperSettings with id: {id}"); + } + catch (Exception ex) + { + _logger.LogError($"Error updating JobScrapperSettings with id: {id}. Exception: {ex.Message}"); + throw; + } + } + } +} diff --git a/src/Common/Repositories/JobsRepository.cs b/src/Common/Repositories/JobsRepository.cs index f95d4b8..3086fdb 100644 --- a/src/Common/Repositories/JobsRepository.cs +++ b/src/Common/Repositories/JobsRepository.cs @@ -1,10 +1,12 @@ namespace Common.Repositories { + using Common.DatabaseModels; using Common.Enums; using Common.Factories; - using Common.Models; + using Common.Queries; using Microsoft.Azure.Cosmos; using Microsoft.Extensions.Logging; + using System.Net; public class JobsRepository { @@ -50,22 +52,41 @@ public async Task GetJobByIdAsync(string id) } } - public async Task CreateOrUpdateJobAsync(ScrappedJob job) + /// + /// Create the item only if it does not already exist using a single DB call. + /// Returns true if the item was created, false if it already existed. 
+ /// + public async Task CreateIfNotExistsAsync(ScrappedJob job) { + if (job == null) throw new ArgumentNullException(nameof(job)); try { - // TODO: Do async inserts for faster performance - var res = await this.jobsContainer.UpsertItemAsync(job); + var requestOptions = new ItemRequestOptions + { + // Instruct Cosmos to only create if the item does not exist. + // SDK will translate this to an If-None-Match header. + IfNoneMatchEtag = "*" + }; + + var response = await this.jobsContainer.CreateItemAsync(job, new PartitionKey(job.id), requestOptions); + // Created successfully + this.logger.LogInformation("Created job {id} in Cosmos DB. RU charge: {ru}", job.id, response.RequestCharge); + return true; } - catch (Exception ex) + catch (CosmosException ex) when (ex.StatusCode == HttpStatusCode.PreconditionFailed || ex.StatusCode == HttpStatusCode.Conflict) { - this.logger.LogError($"Failed to push job: {job.id} to container. Ex: {ex}"); + // Item already exists (server enforces the If-None-Match precondition). + this.logger.LogInformation("Job {id} already exists. Skipping create.", job.id); return false; } - - return true; + catch (Exception ex) + { + this.logger.LogError(ex, "Failed to create job {id} in Cosmos DB.", job.id); + throw; + } } + private async Task> QueryJobsAsync(string query) { var queryDefinition = new QueryDefinition(query); @@ -79,5 +100,99 @@ private async Task> QueryJobsAsync(string query) this.logger.LogInformation($"Retrieved {results.Count} jobs from Cosmos DB. Query: {query}"); return results; } + private async Task> QueryJobsAsync(QueryDefinition queryDefinition) + { + var queryResultSetIterator = jobsContainer.GetItemQueryIterator(queryDefinition); + List results = new List(); + while (queryResultSetIterator.HasMoreResults) + { + var response = await queryResultSetIterator.ReadNextAsync(); + results.AddRange(response); + } + this.logger.LogInformation($"Retrieved {results.Count} jobs from Cosmos DB."); + return results; + } + + public async Task> GetJobsFromQuery(JobQuery jobquery) + { + if (jobquery == null) throw new ArgumentNullException(nameof(jobquery)); + + var sql = "SELECT * FROM c WHERE 1=1"; + var qd = new QueryDefinition(sql); + + // JobType: search title or tags + if (!string.IsNullOrWhiteSpace(jobquery.JobType)) + { + qd = qd.WithParameter("@jobType", jobquery.JobType); + sql += " AND CONTAINS(c.jobType, @jobType, true)"; + } + + // Companies (list) + if (jobquery.Companies != null && jobquery.Companies.Count > 0) + { + var companyConditions = new List(); + for (int i = 0; i < jobquery.Companies.Count; i++) + { + var param = $"@company{i}"; + qd = qd.WithParameter(param, jobquery.Companies[i]); + companyConditions.Add($"c.companyName = {param}"); + } + sql += " AND (" + string.Join(" OR ", companyConditions) + ")"; + } + + // Locations: fallback to searching in displayLink, snippet or description + if (jobquery.Locations != null && jobquery.Locations.Count > 0) + { + var locationConditions = new List(); + for (int i = 0; i < jobquery.Locations.Count; i++) + { + var param = $"@location{i}"; + qd = qd.WithParameter(param, jobquery.Locations[i]); + locationConditions.Add($"CONTAINS(c.location, {param}, true)"); + } + sql += " AND (" + string.Join(" OR ", locationConditions) + ")"; + } + + // JobLevel: search in tags array (case-insensitive contains) + if (!string.IsNullOrWhiteSpace(jobquery.JobLevel)) + { + qd = qd.WithParameter("@jobLevel", jobquery.JobLevel); + // Use EXISTS with an IN on the tags array and CONTAINS for case-insensitive matching + sql 
+= " AND EXISTS(SELECT VALUE t FROM t IN c.tags WHERE CONTAINS(t, @jobLevel, true))"; + } + + // Date range (JobPostedTime) + if (jobquery.StartDate > DateTime.MinValue) + { + qd = qd.WithParameter("@startDate", jobquery.StartDate); + sql += " AND c.jobPostedTime >= @startDate"; + } + if (jobquery.EndDate > DateTime.MinValue) + { + qd = qd.WithParameter("@endDate", jobquery.EndDate); + sql += " AND c.jobPostedTime <= @endDate"; + } + + // final ordering / limit - optional, keep callers responsible if needed + qd = new QueryDefinition(sql); // rebuild with final SQL + // re-add parameters (QueryDefinition is immutable-like with chaining, but to keep it simple rebuild) + // Add parameters again + if (!string.IsNullOrWhiteSpace(jobquery.JobType)) qd = qd.WithParameter("@jobType", jobquery.JobType); + if (jobquery.Companies != null) + { + for (int i = 0; i < jobquery.Companies.Count; i++) qd = qd.WithParameter($"@company{i}", jobquery.Companies[i]); + } + if (jobquery.Locations != null) + { + for (int i = 0; i < jobquery.Locations.Count; i++) qd = qd.WithParameter($"@location{i}", jobquery.Locations[i]); + } + if (!string.IsNullOrWhiteSpace(jobquery.JobLevel)) qd = qd.WithParameter("@jobLevel", jobquery.JobLevel); + if (jobquery.StartDate > DateTime.MinValue) qd = qd.WithParameter("@startDate", jobquery.StartDate); + if (jobquery.EndDate > DateTime.MinValue) qd = qd.WithParameter("@endDate", jobquery.EndDate); + + logger.LogInformation($"Constructed job query: {sql}"); + + return await QueryJobsAsync(qd); + } } } diff --git a/src/Common/Repositories/ProblemRepository.cs b/src/Common/Repositories/ProblemRepository.cs index 8ca5d29..1d5fa79 100644 --- a/src/Common/Repositories/ProblemRepository.cs +++ b/src/Common/Repositories/ProblemRepository.cs @@ -1,4 +1,5 @@ -using Common.Enums; +using Common.DatabaseModels; +using Common.Enums; using Common.Factories; using Common.Models; using Microsoft.Azure.Cosmos; diff --git a/src/PetProjectAzFunctions/.gitignore b/src/PetProjectAzFunctions/.gitignore new file mode 100644 index 0000000..ff5b00c --- /dev/null +++ b/src/PetProjectAzFunctions/.gitignore @@ -0,0 +1,264 @@ +## Ignore Visual Studio temporary files, build results, and +## files generated by popular Visual Studio add-ons. 
+ +# Azure Functions localsettings file +local.settings.json + +# User-specific files +*.suo +*.user +*.userosscache +*.sln.docstates + +# User-specific files (MonoDevelop/Xamarin Studio) +*.userprefs + +# Build results +[Dd]ebug/ +[Dd]ebugPublic/ +[Rr]elease/ +[Rr]eleases/ +x64/ +x86/ +bld/ +[Bb]in/ +[Oo]bj/ +[Ll]og/ + +# Visual Studio 2015 cache/options directory +.vs/ +# Uncomment if you have tasks that create the project's static files in wwwroot +#wwwroot/ + +# MSTest test Results +[Tt]est[Rr]esult*/ +[Bb]uild[Ll]og.* + +# NUNIT +*.VisualState.xml +TestResult.xml + +# Build Results of an ATL Project +[Dd]ebugPS/ +[Rr]eleasePS/ +dlldata.c + +# DNX +project.lock.json +project.fragment.lock.json +artifacts/ + +*_i.c +*_p.c +*_i.h +*.ilk +*.meta +*.obj +*.pch +*.pdb +*.pgc +*.pgd +*.rsp +*.sbr +*.tlb +*.tli +*.tlh +*.tmp +*.tmp_proj +*.log +*.vspscc +*.vssscc +.builds +*.pidb +*.svclog +*.scc + +# Chutzpah Test files +_Chutzpah* + +# Visual C++ cache files +ipch/ +*.aps +*.ncb +*.opendb +*.opensdf +*.sdf +*.cachefile +*.VC.db +*.VC.VC.opendb + +# Visual Studio profiler +*.psess +*.vsp +*.vspx +*.sap + +# TFS 2012 Local Workspace +$tf/ + +# Guidance Automation Toolkit +*.gpState + +# ReSharper is a .NET coding add-in +_ReSharper*/ +*.[Rr]e[Ss]harper +*.DotSettings.user + +# JustCode is a .NET coding add-in +.JustCode + +# TeamCity is a build add-in +_TeamCity* + +# DotCover is a Code Coverage Tool +*.dotCover + +# NCrunch +_NCrunch_* +.*crunch*.local.xml +nCrunchTemp_* + +# MightyMoose +*.mm.* +AutoTest.Net/ + +# Web workbench (sass) +.sass-cache/ + +# Installshield output folder +[Ee]xpress/ + +# DocProject is a documentation generator add-in +DocProject/buildhelp/ +DocProject/Help/*.HxT +DocProject/Help/*.HxC +DocProject/Help/*.hhc +DocProject/Help/*.hhk +DocProject/Help/*.hhp +DocProject/Help/Html2 +DocProject/Help/html + +# Click-Once directory +publish/ + +# Publish Web Output +*.[Pp]ublish.xml +*.azurePubxml +# TODO: Comment the next line if you want to checkin your web deploy settings +# but database connection strings (with potential passwords) will be unencrypted +#*.pubxml +*.publishproj + +# Microsoft Azure Web App publish settings. Comment the next line if you want to +# checkin your Azure Web App publish settings, but sensitive information contained +# in these scripts will be unencrypted +PublishScripts/ + +# NuGet Packages +*.nupkg +# The packages folder can be ignored because of Package Restore +**/packages/* +# except build/, which is used as an MSBuild target. 
+!**/packages/build/ +# Uncomment if necessary however generally it will be regenerated when needed +#!**/packages/repositories.config +# NuGet v3's project.json files produces more ignoreable files +*.nuget.props +*.nuget.targets + +# Microsoft Azure Build Output +csx/ +*.build.csdef + +# Microsoft Azure Emulator +ecf/ +rcf/ + +# Windows Store app package directories and files +AppPackages/ +BundleArtifacts/ +Package.StoreAssociation.xml +_pkginfo.txt + +# Visual Studio cache files +# files ending in .cache can be ignored +*.[Cc]ache +# but keep track of directories ending in .cache +!*.[Cc]ache/ + +# Others +ClientBin/ +~$* +*~ +*.dbmdl +*.dbproj.schemaview +*.jfm +*.pfx +*.publishsettings +node_modules/ +orleans.codegen.cs + +# Since there are multiple workflows, uncomment next line to ignore bower_components +# (https://github.com/github/gitignore/pull/1529#issuecomment-104372622) +#bower_components/ + +# RIA/Silverlight projects +Generated_Code/ + +# Backup & report files from converting an old project file +# to a newer Visual Studio version. Backup files are not needed, +# because we have git ;-) +_UpgradeReport_Files/ +Backup*/ +UpgradeLog*.XML +UpgradeLog*.htm + +# SQL Server files +*.mdf +*.ldf + +# Business Intelligence projects +*.rdl.data +*.bim.layout +*.bim_*.settings + +# Microsoft Fakes +FakesAssemblies/ + +# GhostDoc plugin setting file +*.GhostDoc.xml + +# Node.js Tools for Visual Studio +.ntvs_analysis.dat + +# Visual Studio 6 build log +*.plg + +# Visual Studio 6 workspace options file +*.opt + +# Visual Studio LightSwitch build output +**/*.HTMLClient/GeneratedArtifacts +**/*.DesktopClient/GeneratedArtifacts +**/*.DesktopClient/ModelManifest.xml +**/*.Server/GeneratedArtifacts +**/*.Server/ModelManifest.xml +_Pvt_Extensions + +# Paket dependency manager +.paket/paket.exe +paket-files/ + +# FAKE - F# Make +.fake/ + +# JetBrains Rider +.idea/ +*.sln.iml + +# CodeRush +.cr/ + +# Python Tools for Visual Studio (PTVS) +__pycache__/ +*.pyc \ No newline at end of file diff --git a/src/PetProjectAzFunctions/Dockerfile b/src/PetProjectAzFunctions/Dockerfile new file mode 100644 index 0000000..61200dd --- /dev/null +++ b/src/PetProjectAzFunctions/Dockerfile @@ -0,0 +1,29 @@ +# See https://aka.ms/customizecontainer to learn how to customize your debug container and how Visual Studio uses this Dockerfile to build your images for faster debugging. + +# This stage is used when running from VS in fast mode (Default for Debug configuration) +FROM mcr.microsoft.com/azure-functions/dotnet-isolated:4-dotnet-isolated8.0 AS base +WORKDIR /home/site/wwwroot +EXPOSE 8080 + + +# This stage is used to build the service project +FROM mcr.microsoft.com/dotnet/sdk:8.0 AS build +ARG BUILD_CONFIGURATION=Release +WORKDIR /src +COPY ["PetProjectAzFunctions/PetProjectAzFunctions.csproj", "PetProjectAzFunctions/"] +RUN dotnet restore "./PetProjectAzFunctions/PetProjectAzFunctions.csproj" +COPY . . +WORKDIR "/src/PetProjectAzFunctions" +RUN dotnet build "./PetProjectAzFunctions.csproj" -c $BUILD_CONFIGURATION -o /app/build + +# This stage is used to publish the service project to be copied to the final stage +FROM build AS publish +ARG BUILD_CONFIGURATION=Release +RUN dotnet publish "./PetProjectAzFunctions.csproj" -c $BUILD_CONFIGURATION -o /app/publish /p:UseAppHost=false + +# This stage is used in production or when running from VS in regular mode (Default when not using the Debug configuration) +FROM base AS final +WORKDIR /home/site/wwwroot +COPY --from=publish /app/publish . 
+ENV AzureWebJobsScriptRoot=/home/site/wwwroot \
+    AzureFunctionsJobHost__Logging__Console__IsEnabled=true
\ No newline at end of file
diff --git a/src/PetProjectAzFunctions/JobOpeningsSyncFunction.cs b/src/PetProjectAzFunctions/JobOpeningsSyncFunction.cs
new file mode 100644
index 0000000..a87b8db
--- /dev/null
+++ b/src/PetProjectAzFunctions/JobOpeningsSyncFunction.cs
@@ -0,0 +1,67 @@
+using System;
+using Common.Managers;
+using Common.Repositories;
+using Microsoft.Azure.Functions.Worker;
+using Microsoft.Extensions.Configuration;
+using Microsoft.Extensions.DependencyInjection;
+using Microsoft.Extensions.Logging;
+
+namespace PetProjectAzFunctions
+{
+    public class JobOpeningsSyncFunction
+    {
+        private readonly ILogger _logger;
+
+        private readonly JobScrapperSettingsRepository _jobScrapperSettingsRepository;
+
+        private readonly IServiceProvider _serviceProvider;
+
+        public JobOpeningsSyncFunction(ILoggerFactory loggerFactory,
+            JobScrapperSettingsRepository jobScrapperSettingsRepository,
+            IServiceProvider serviceProvider)
+        {
+            _logger = loggerFactory.CreateLogger<JobOpeningsSyncFunction>();
+            _jobScrapperSettingsRepository = jobScrapperSettingsRepository;
+            _serviceProvider = serviceProvider;
+        }
+
+        [Function("JobOpeningsSyncFunction")]
+        public async Task Run([TimerTrigger("%CronPeriod%")] TimerInfo myTimer)
+        {
+            _logger.LogInformation($"C# Timer trigger function executed at: {DateTime.Now}");
+            var scrapperSettings = await _jobScrapperSettingsRepository.GetAllSettings();
+            var currentTime = DateTime.UtcNow;
+            await Parallel.ForEachAsync(scrapperSettings, async (setting, ct) =>
+            {
+                try
+                {
+                    if (setting.enabled)
+                    {
+                        // Run only when the next scheduled run (last run + interval)
+                        // is already due, with a one-minute grace window.
+                        if (setting.lastRunTime.AddMinutes(setting.runIntervalInMinutes) <= currentTime.AddMinutes(1))
+                        {
+                            using var scope = _serviceProvider.CreateScope();
+                            var scrapperInstance = scope.ServiceProvider.GetRequiredService<JobScrapper>();
+                            scrapperInstance.ConfigureSettings(setting);
+                            await scrapperInstance.RunAsync();
+                            setting.lastRunTime = currentTime;
+                            await _jobScrapperSettingsRepository.UpdateSettingsAsync(setting.id, setting);
+                        }
+                        else
+                        {
+                            _logger.LogInformation($"Scrapper setting {setting.id} was run at {setting.lastRunTime}, next run schedule has not yet come. Skipping this run.");
+                        }
+                    }
+                    else
+                    {
+                        _logger.LogInformation($"Scrapper setting {setting.id} is disabled. Skipping.");
+                        return;
+                    }
+                }
+                catch (Exception ex)
+                {
+                    _logger.LogError(ex, $"Error processing scrapper settings: {setting}");
+                }
+            });
+        }
+    }
+}
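The gating logic in Run reduces to one predicate: a setting is due when its last run plus its interval has passed. A standalone sketch of that check, with the same one-minute grace used above:

    static bool IsDue(DateTime lastRunTimeUtc, int runIntervalInMinutes, DateTime nowUtc)
    {
        // Next scheduled run is last run + interval; due once that moment has passed.
        return lastRunTimeUtc.AddMinutes(runIntervalInMinutes) <= nowUtc.AddMinutes(1);
    }

    // e.g. last run 10:00 with a 60-minute interval: not due at 10:30, due at 11:05.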
Skipping."); + return; + } + } + catch (Exception ex) + { + _logger.LogError(ex, $"Error processing scrapper settings: {setting}"); + } + }); + } + } +} diff --git a/src/PetProjectAzFunctions/PetProjectAzFunctions.csproj b/src/PetProjectAzFunctions/PetProjectAzFunctions.csproj new file mode 100644 index 0000000..873e3b4 --- /dev/null +++ b/src/PetProjectAzFunctions/PetProjectAzFunctions.csproj @@ -0,0 +1,37 @@ + + + net8.0 + v4 + Exe + enable + enable + /home/site/wwwroot + Linux + + + + + + + + + + + + + + + + + + PreserveNewest + + + PreserveNewest + Never + + + + + + \ No newline at end of file diff --git a/src/PetProjectAzFunctions/Program.cs b/src/PetProjectAzFunctions/Program.cs new file mode 100644 index 0000000..7de0b9e --- /dev/null +++ b/src/PetProjectAzFunctions/Program.cs @@ -0,0 +1,53 @@ +using Common.Constants; +using Common.Engines; +using Common.Factories; +using Common.Managers; +using Common.Repositories; +using Microsoft.Azure.Cosmos; +using Microsoft.Azure.Functions.Worker.Builder; +using Microsoft.Extensions.DependencyInjection; +using Microsoft.Extensions.Hosting; +using System.Data; + +public class Program +{ + public static void Main(string[] args) + { + var builder = FunctionsApplication.CreateBuilder(args); + + builder.ConfigureFunctionsWebApplication(); + ConfigureServices(builder); + builder.Build().Run(); + } + + private static void ConfigureServices(FunctionsApplicationBuilder builder) + { + var services = builder.Services; + // Register your services here + services.AddLogging(); + services.AddHttpClient(); + services.AddTransient(); + services.AddTransient(); + services.AddTransient(); + services.AddTransient(); + services.AddTransient(); + + var config = builder.Configuration; + + #region Register Cosmos related services + services.AddSingleton(s => + { + var cosmosDbUri = config[ConfigurationConstants.CosmosDBUriKey]; + var cosmosDbAccountKey = config[ConfigurationConstants.CosmosDBAccountKey]; + if (string.IsNullOrEmpty(cosmosDbUri) || string.IsNullOrEmpty(cosmosDbAccountKey)) + { + throw new DataException("Cosmos DB configuration is missing or invalid."); + } + return new CosmosClient(cosmosDbUri, cosmosDbAccountKey); + }); + + services.AddTransient(); + #endregion + + } +} diff --git a/src/PetProjectAzFunctions/Properties/launchSettings.json b/src/PetProjectAzFunctions/Properties/launchSettings.json new file mode 100644 index 0000000..6a6168e --- /dev/null +++ b/src/PetProjectAzFunctions/Properties/launchSettings.json @@ -0,0 +1,15 @@ +{ + "profiles": { + "PetProjectAzFunctions": { + "commandName": "Project", + "commandLineArgs": "--port 7149" + }, + "Container (Dockerfile)": { + "commandName": "Docker", + "launchUrl": "{Scheme}://{ServiceHost}:{ServicePort}", + "containerRunArguments": "--init", + "httpPort": 31027, + "useSSL": false + } + } +} \ No newline at end of file diff --git a/src/PetProjectAzFunctions/Properties/serviceDependencies.json b/src/PetProjectAzFunctions/Properties/serviceDependencies.json new file mode 100644 index 0000000..df4dcc9 --- /dev/null +++ b/src/PetProjectAzFunctions/Properties/serviceDependencies.json @@ -0,0 +1,11 @@ +{ + "dependencies": { + "appInsights1": { + "type": "appInsights" + }, + "storage1": { + "type": "storage", + "connectionId": "AzureWebJobsStorage" + } + } +} \ No newline at end of file diff --git a/src/PetProjectAzFunctions/host.json b/src/PetProjectAzFunctions/host.json new file mode 100644 index 0000000..ee5cf5f --- /dev/null +++ b/src/PetProjectAzFunctions/host.json @@ -0,0 +1,12 @@ +{ + "version": 
"2.0", + "logging": { + "applicationInsights": { + "samplingSettings": { + "isEnabled": true, + "excludedTypes": "Request" + }, + "enableLiveMetricsFilters": true + } + } +} \ No newline at end of file diff --git a/src/PetProjectAzFunctions/readme.md b/src/PetProjectAzFunctions/readme.md new file mode 100644 index 0000000..0b247b5 --- /dev/null +++ b/src/PetProjectAzFunctions/readme.md @@ -0,0 +1,11 @@ +# TimerTrigger - C# + +The `TimerTrigger` makes it incredibly easy to have your functions executed on a schedule. This sample demonstrates a simple use case of calling your function every 5 minutes. + +## How it works + +For a `TimerTrigger` to work, you provide a schedule in the form of a [cron expression](https://en.wikipedia.org/wiki/Cron#CRON_expression)(See the link for full details). A cron expression is a string with 6 separate expressions which represent a given schedule via patterns. The pattern we use to represent every 5 minutes is `0 */5 * * * *`. This, in plain text, means: "When seconds is equal to 0, minutes is divisible by 5, for any hour, day of the month, month, day of the week, or year". + +## Learn more + + Documentation \ No newline at end of file diff --git a/src/Synchronizer/ProblemsProcessor.cs b/src/Synchronizer/ProblemsProcessor.cs index db86026..a1a66d8 100644 --- a/src/Synchronizer/ProblemsProcessor.cs +++ b/src/Synchronizer/ProblemsProcessor.cs @@ -1,5 +1,6 @@ using Microsoft.Azure.Cosmos; using Common.Models; +using Common.DatabaseModels; namespace Synchronizer; diff --git a/src/lcw.sln b/src/lcw.sln index ec43a01..424b481 100644 --- a/src/lcw.sln +++ b/src/lcw.sln @@ -9,10 +9,9 @@ Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "Backend", "Backend\Backend. EndProject Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "Synchronizer", "Synchronizer\Synchronizer.csproj", "{BF0FF8B1-3D65-459E-8CA1-A7C0ED4F97B9}" EndProject +Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "PetProjectAzFunctions", "PetProjectAzFunctions\PetProjectAzFunctions.csproj", "{31C50D63-3018-4679-92FD-F080D47A32D0}" +EndProject Global - GlobalSection(SolutionProperties) = preSolution - HideSolutionNode = FALSE - EndGlobalSection GlobalSection(SolutionConfigurationPlatforms) = preSolution Debug|Any CPU = Debug|Any CPU Debug|x64 = Debug|x64 @@ -58,5 +57,20 @@ Global {BF0FF8B1-3D65-459E-8CA1-A7C0ED4F97B9}.Release|x64.Build.0 = Release|Any CPU {BF0FF8B1-3D65-459E-8CA1-A7C0ED4F97B9}.Release|x86.ActiveCfg = Release|Any CPU {BF0FF8B1-3D65-459E-8CA1-A7C0ED4F97B9}.Release|x86.Build.0 = Release|Any CPU + {31C50D63-3018-4679-92FD-F080D47A32D0}.Debug|Any CPU.ActiveCfg = Debug|Any CPU + {31C50D63-3018-4679-92FD-F080D47A32D0}.Debug|Any CPU.Build.0 = Debug|Any CPU + {31C50D63-3018-4679-92FD-F080D47A32D0}.Debug|x64.ActiveCfg = Debug|Any CPU + {31C50D63-3018-4679-92FD-F080D47A32D0}.Debug|x64.Build.0 = Debug|Any CPU + {31C50D63-3018-4679-92FD-F080D47A32D0}.Debug|x86.ActiveCfg = Debug|Any CPU + {31C50D63-3018-4679-92FD-F080D47A32D0}.Debug|x86.Build.0 = Debug|Any CPU + {31C50D63-3018-4679-92FD-F080D47A32D0}.Release|Any CPU.ActiveCfg = Release|Any CPU + {31C50D63-3018-4679-92FD-F080D47A32D0}.Release|Any CPU.Build.0 = Release|Any CPU + {31C50D63-3018-4679-92FD-F080D47A32D0}.Release|x64.ActiveCfg = Release|Any CPU + {31C50D63-3018-4679-92FD-F080D47A32D0}.Release|x64.Build.0 = Release|Any CPU + {31C50D63-3018-4679-92FD-F080D47A32D0}.Release|x86.ActiveCfg = Release|Any CPU + {31C50D63-3018-4679-92FD-F080D47A32D0}.Release|x86.Build.0 = Release|Any CPU + EndGlobalSection + GlobalSection(SolutionProperties) = 
From 8107cf7dbb585d5553873027850ec12a9816e5b6 Mon Sep 17 00:00:00 2001
From: devidnyk
Date: Sat, 25 Oct 2025 19:52:44 +0530
Subject: [PATCH 7/7] Added basic background process controller and added basic
 scrapper

---
 src/Backend/Controllers/AdminController.cs     |  69 +++++++++++
 .../Controllers/JobSearchController.cs         |  16 +--
 .../Controllers/ScrapperSettingsController.cs  |  26 ++--
 src/Backend/Operations/ScrapperRunner.cs       | 117 ++++++++++++++++++
 src/Backend/Program.cs                         |   1 +
 src/Backend/Views/JobListView.cs               |  54 ++++++++
 src/Backend/appsettings.json                   |   2 +-
 .../DatabaseModels/JobScrapperSettings.cs      |   2 +-
 src/Common/DatabaseModels/QuerySettings.cs     |   4 +-
 src/Common/Engines/GSEngine.cs                 |   4 +-
 src/Common/Managers/JobScrapper.cs             |  12 +-
 .../Managers/JobScrapperSettingsManager.cs     |  68 +++++-----
 src/Common/Models/Public/QuerySettings.cs      |   2 +-
 .../JobScrapperSettingsRepository.cs           |   2 +-
 src/Common/Repositories/JobsRepository.cs      |   9 ++
 15 files changed, 327 insertions(+), 61 deletions(-)
 create mode 100644 src/Backend/Controllers/AdminController.cs
 create mode 100644 src/Backend/Operations/ScrapperRunner.cs
 create mode 100644 src/Backend/Views/JobListView.cs

diff --git a/src/Backend/Controllers/AdminController.cs b/src/Backend/Controllers/AdminController.cs
new file mode 100644
index 0000000..dad6987
--- /dev/null
+++ b/src/Backend/Controllers/AdminController.cs
@@ -0,0 +1,69 @@
+
+namespace Backend.Controllers
+{
+    using Backend.Operations;
+    using Common.Managers;
+    using Microsoft.AspNetCore.Mvc;
+
+    [ApiController]
+    [Route("api/admin")]
+    public class AdminController : ControllerBase
+    {
+        private readonly ILogger<AdminController> logger;
+        private readonly ScrapperRunner scrapperRunner;
+
+        public AdminController(ILogger<AdminController> logger, ScrapperRunner scrapperRunner)
+        {
+            this.logger = logger;
+            this.scrapperRunner = scrapperRunner;
+        }
+
+        [HttpGet]
+        [Route("scrappers/trigger/{scrapperId}")]
+        public ActionResult TriggerScrapperRun(string scrapperId)
+        {
+            // _ = Task.Run(async () => await scrapperRunner.RunScrapperAsync(scrapperId));
+            return Ok($"[Dummy]: Scrapper run triggered for id: {scrapperId}");
+        }
+
+        [HttpPut]
+        [Route("scrappers/trigger/{scrapperId}")]
+        public ActionResult EnableScrapper(string scrapperId)
+        {
+            this.scrapperRunner.EnableScrapper(scrapperId);
+            return Ok($"Scrapper enabled for id: {scrapperId}");
+        }
+
+        [HttpDelete]
+        [Route("scrappers/trigger/{scrapperId}")]
+        public ActionResult DisableScrapper(string scrapperId)
+        {
+            this.scrapperRunner.DisableScrapper(scrapperId);
+            return Ok($"Scrapper disabled for id: {scrapperId}");
+        }
+
+        [HttpGet]
+        [Route("scrappers/background/start")]
+        public ActionResult StartScrappersInBackground()
+        {
+            this.scrapperRunner.StartBackgroundRunner();
+            return Ok($"Background scrapper runs started. Current State: {this.scrapperRunner.CurrentState}");
+        }
+
+        [HttpGet]
+        [Route("scrappers/background/stop")]
+        public ActionResult StopScrappersInBackground()
+        {
+            this.scrapperRunner.StopBackgroundRunner();
+            return Ok($"Background scrapper runs stopped. Current State: {this.scrapperRunner.CurrentState}");
+        }
+
+        [HttpGet]
+        [Route("scrappers/background/status")]
+        public ActionResult GetScrappersInBackgroundStatus()
+        {
+            // Status is read-only; it must not stop the runner.
+            return Ok($"{this.scrapperRunner.GetStatus()}");
+        }
+    }
+}
\ No newline at end of file
diff --git a/src/Backend/Controllers/JobSearchController.cs b/src/Backend/Controllers/JobSearchController.cs
index 2ced804..7a396e4 100644
--- a/src/Backend/Controllers/JobSearchController.cs
+++ b/src/Backend/Controllers/JobSearchController.cs
@@ -11,7 +11,7 @@ namespace Backend.Controllers
     using Common.DatabaseModels;
 
     [ApiController]
-    [Route("api")]
+    [Route("api/jobs")]
     public class JobSearchController : ControllerBase
     {
         private readonly JobsRepository jobsRepository;
@@ -23,28 +23,30 @@ public JobSearchController(JobsRepository jobsRepository, ILogger<JobSearchController> logger)
 
         [HttpPost]
-        [Route("jobs/search")]
+        [Route("search")]
         public async Task<ActionResult<List<ScrappedJob>>> SearchJobs([FromBody] JobQuery jobquery)
         {
             return Ok(await jobsRepository.GetJobsFromQuery(jobquery));
         }
 
         [HttpGet]
-        [Route("jobs/latest")]
-        public async Task<ActionResult<List<ScrappedJob>>> GetLatestJobsFromDb()
+        [Route("latest")]
+        public async Task<ActionResult<string>> GetLatestJobsFromDb(
+            [FromQuery] string location = "India",
+            [FromQuery] string level = "Mid")
         {
-            return Ok(await this.jobsRepository.GetAllLatestJobsAsync());
+            return Content(JobListView.RenderScrappedJobsHtml(await this.jobsRepository.GetJobsEasyQueryAsync(location, level)), "text/html");
         }
 
         [HttpGet]
-        [Route("jobs/lastOneDay")]
+        [Route("lastOneDay")]
         public async Task<ActionResult<List<ScrappedJob>>> GetLastOneDayJobsFromDb()
         {
             return Ok(await this.jobsRepository.GetAllJobsInLastOneDay());
         }
 
         [HttpGet]
-        [Route("jobs/profile/{id}")]
+        [Route("profile/{id}")]
         public async Task<ActionResult<ScrappedJob>> GetJobById(string id)
         {
             var job = await this.jobsRepository.GetJobByIdAsync(id);
diff --git a/src/Backend/Controllers/ScrapperSettingsController.cs b/src/Backend/Controllers/ScrapperSettingsController.cs
index 3cbdc4b..4192716 100644
--- a/src/Backend/Controllers/ScrapperSettingsController.cs
+++ b/src/Backend/Controllers/ScrapperSettingsController.cs
@@ -9,14 +9,14 @@ namespace Backend.Controllers
 {
 
     [ApiController]
-    [Route("api/[controller]")]
+    [Route("api/jobs/scrappers")]
     public class ScrapperSettingsController : ControllerBase
     {
         private readonly JobScrapperSettingsManager _settingsManager;
         private readonly ILogger<ScrapperSettingsController> _logger;
 
-        public ScrapperSettingsController( JobScrapperSettingsManager jobScrapperSettingsManager,
+        public ScrapperSettingsController(JobScrapperSettingsManager jobScrapperSettingsManager,
             ILogger<ScrapperSettingsController> logger)
         {
             _settingsManager = jobScrapperSettingsManager;
@@ -24,7 +24,7 @@ public ScrapperSettingsController( JobScrapperSettingsManager jobScrapperSetting
         }
 
         [HttpGet]
-        [Route("jobs/scrappers")]
+        [Route("")]
         public async Task<ActionResult<List<JobScrapperSettings>>> GetAllJobScrappers()
         {
             // Placeholder implementation for getting all scrappers
@@ -32,23 +32,29 @@ public async Task<ActionResult<List<JobScrapperSettings>>> GetAllJobScrappers()
         }
 
         [HttpPut]
-        [Route("jobs/scrappers/{id}")]
+        [Route("{id}")]
         public async Task<ActionResult<JobScrapperSettings>> UpdateJobScrapperSettings(string id, [FromBody] ScrapperSettings settings)
         {
-            // Placeholder implementation for updating scrapper settings
-            return Ok(await _settingsManager.CreateOrUpdateSettings(id, settings));
+            try
+            {
+                return Ok(await _settingsManager.CreateOrUpdateSettings(id, settings));
+            }
+            catch (InvalidOperationException ex)
+            {
+                return BadRequest(ex.Message);
+            }
         }
 
         [HttpPost]
-        [Route("jobs/scrappers/Add")]
+        [Route("add")]
         public async Task<ActionResult<JobScrapperSettings>> CreateNewJobScrapperSettings([FromBody] ScrapperSettings settings)
         {
-            // Placeholder implementation for updating scrapper settings
-            return Ok(await _settingsManager.CreateOrUpdateSettings(string.Empty, settings));
+            return BadRequest("Use PUT api/jobs/scrappers/{id} to create or update scrapper settings.");
+            // return Ok(await _settingsManager.CreateOrUpdateSettings(string.Empty, settings));
         }
 
         [HttpGet]
-        [Route("jobs/scrappers/{id}")]
+        [Route("{id}")]
         public async Task<ActionResult<JobScrapperSettings>> GetJobScrapperSettings(string id)
         {
             // Placeholder implementation for getting scrapper settings
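For quick manual testing, the admin surface above can be driven with any HTTP client; a sketch assuming the backend listens on localhost:5000:

    using var http = new HttpClient { BaseAddress = new Uri("http://localhost:5000/") };

    // Enable one scrapper, start the background loop, then read its status.
    await http.PutAsync("api/admin/scrappers/trigger/my-scrapper-id", null);
    await http.GetAsync("api/admin/scrappers/background/start");
    Console.WriteLine(await http.GetStringAsync("api/admin/scrappers/background/status"));

"my-scrapper-id" is a placeholder for a real settings id.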
diff --git a/src/Backend/Operations/ScrapperRunner.cs b/src/Backend/Operations/ScrapperRunner.cs
new file mode 100644
index 0000000..79d8f2c
--- /dev/null
+++ b/src/Backend/Operations/ScrapperRunner.cs
@@ -0,0 +1,117 @@
+using System.Collections.Concurrent;
+using System.Text;
+using Common.Engines;
+using Common.Managers;
+using Common.Repositories;
+
+namespace Backend.Operations
+{
+    public class ScrapperRunner
+    {
+        ILogger<ScrapperRunner> logger;
+        GSEngine gsEngine;
+        AIEngine aiEngine;
+        JobsRepository jobsRepository;
+        JobScrapperSettingsManager settingsManager;
+
+        private ConcurrentBag<string> enabledScrappers = new ConcurrentBag<string>();
+        private TimeSpan runInterval = TimeSpan.FromHours(3);
+        private CancellationTokenSource cts = new CancellationTokenSource();
+        private Task backgroundTask = null;
+        public string CurrentState { get; private set; } = "Stopped";
+        private string LastError = string.Empty;
+        private DateTime lastRunTime = DateTime.MinValue;
+
+        public ScrapperRunner(ILogger<ScrapperRunner> logger, JobScrapperSettingsManager settingsManager, GSEngine gSEngine, AIEngine aIEngine, JobsRepository jobsRepository)
+        {
+            this.logger = logger;
+            this.gsEngine = gSEngine;
+            this.aiEngine = aIEngine;
+            this.jobsRepository = jobsRepository;
+            this.settingsManager = settingsManager;
+        }
+
+        public void EnableScrapper(string scrapperId)
+        {
+            if (!enabledScrappers.Contains(scrapperId))
+            {
+                enabledScrappers.Add(scrapperId);
+            }
+        }
+
+        public void DisableScrapper(string scrapperId)
+        {
+            enabledScrappers = new ConcurrentBag<string>(enabledScrappers.Except(new List<string> { scrapperId }));
+        }
+
+
+        public async Task RunScrapperAsync(string scrapperId)
+        {
+            var settings = await this.settingsManager.GetSettingsById(scrapperId);
+            if (settings == null)
+            {
+                logger.LogWarning($"Scrapper settings not found for id: {scrapperId}. Skipping scrapper run.");
+                return;
+            }
+
+            try
+            {
+                var scrapper = new JobScrapper(gsEngine, aiEngine, jobsRepository, logger);
+                scrapper.ConfigureSettings(settings);
+                await scrapper.RunAsync();
+                logger.LogInformation($"Scrapper run completed for id: {scrapperId}");
+                settings.lastRunTime = DateTime.UtcNow;
+                await this.settingsManager.UpdateSettingsAsync(scrapperId, settings);
+            }
+            catch (Exception ex)
+            {
+                logger.LogError($"Error running scrapper for id: {scrapperId}. Exception: {ex}");
+                this.LastError = ex.Message;
+            }
+        }
+
+        public void StartBackgroundRunner()
+        {
+            if (backgroundTask == null || backgroundTask.IsCompleted)
+            {
+                cts = new CancellationTokenSource();
+                backgroundTask = RunInBackgroundAsync(cts.Token);
+                this.CurrentState = "Running";
+            }
+        }
+
+        public void StopBackgroundRunner()
+        {
+            if (cts != null && !cts.IsCancellationRequested)
+            {
+                cts.Cancel();
+                this.CurrentState = "Stopped";
+            }
+        }
+
+        public string GetStatus()
+        {
+            var sb = new StringBuilder();
+            sb.Append($"CurrentState: {this.CurrentState}\n");
+            sb.Append($"AI Engine Ready: {this.aiEngine.IsReady()}\n");
+            sb.Append($"Run Interval: {this.runInterval} | Last Run Time (UTC): {this.lastRunTime}\n");
+            sb.Append($"EnabledScrappers: {string.Join(",", this.enabledScrappers)}\n");
+            sb.Append($"LastError: {this.LastError}");
+            return sb.ToString();
+        }
+
+        private async Task RunInBackgroundAsync(CancellationToken cancellationToken)
+        {
+            while (!cancellationToken.IsCancellationRequested)
+            {
+                lastRunTime = DateTime.UtcNow;
+                foreach (var scrapperId in enabledScrappers)
+                {
+                    logger.LogInformation($"Starting scrapper run for id: {scrapperId}");
+                    await RunScrapperAsync(scrapperId);
+                }
+                try
+                {
+                    await Task.Delay(runInterval, cancellationToken);
+                }
+                catch (TaskCanceledException)
+                {
+                    // StopBackgroundRunner cancelled the token; exit cleanly instead of faulting the task.
+                    break;
+                }
+            }
+        }
+    }
+}
\ No newline at end of file
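ScrapperRunner manages its own Task and CancellationTokenSource. The same loop could instead lean on the framework's hosted-service support, which handles startup and graceful shutdown; a minimal sketch, with RunEnabledScrappersOnceAsync standing in as a hypothetical single-pass helper:

    public class ScrapperBackgroundService : BackgroundService
    {
        private readonly ScrapperRunner runner;
        public ScrapperBackgroundService(ScrapperRunner runner) => this.runner = runner;

        protected override async Task ExecuteAsync(CancellationToken stoppingToken)
        {
            while (!stoppingToken.IsCancellationRequested)
            {
                await runner.RunEnabledScrappersOnceAsync(); // hypothetical helper
                await Task.Delay(TimeSpan.FromHours(3), stoppingToken);
            }
        }
    }
    // Registered with: services.AddHostedService<ScrapperBackgroundService>();

The hand-rolled approach does keep start/stop controllable from AdminController without touching the host lifetime.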
Exception: {ex}"); + this.LastError = ex.Message; + } + } + + public void StartBackgroundRunner() + { + if (backgroundTask == null || backgroundTask.IsCompleted) + { + cts = new CancellationTokenSource(); + backgroundTask = RunInBackgroundAsync(cts.Token); + this.CurrentState = "Running"; + } + } + + public void StopBackgroundRunner() + { + if (cts != null && !cts.IsCancellationRequested) + { + cts.Cancel(); + this.CurrentState = "Stopped"; + } + } + + public string GetStatus() + { + var sb = new StringBuilder(); + sb.Append($"CurrentState: {this.CurrentState}\n"); + sb.Append($"AI Engine Ready: {this.aiEngine.IsReady()}\n"); + sb.Append($"Run Interval: {this.runInterval} | Last Run Time (UTC): {this.lastRunTime}\n"); + sb.Append($"EnabledScrappers: {string.Join(",", this.enabledScrappers)}\n"); + sb.Append($"LastError: {this.LastError}"); + return sb.ToString(); + } + + private async Task RunInBackgroundAsync(CancellationToken cancellationToken) + { + while (!cancellationToken.IsCancellationRequested) + { + lastRunTime = DateTime.UtcNow; + foreach (var scrapperId in enabledScrappers) + { + logger.LogInformation($"Starting scrapper run for id: {scrapperId}"); + await RunScrapperAsync(scrapperId); + } + await Task.Delay(runInterval, cancellationToken); + } + } + } +} \ No newline at end of file diff --git a/src/Backend/Program.cs b/src/Backend/Program.cs index 8d434c8..12127ae 100644 --- a/src/Backend/Program.cs +++ b/src/Backend/Program.cs @@ -102,6 +102,7 @@ public static void Main(string[] args) services.AddSingleton(); services.AddSingleton(); services.AddSingleton(); + services.AddSingleton(); var app = builder.Build(); ILogger logger = app.Logger; diff --git a/src/Backend/Views/JobListView.cs b/src/Backend/Views/JobListView.cs new file mode 100644 index 0000000..d1fdff1 --- /dev/null +++ b/src/Backend/Views/JobListView.cs @@ -0,0 +1,54 @@ +using System.Text; +using Common.DatabaseModels; + +public static class JobListView +{ + public static string RenderScrappedJobsHtml(List jobs) + { + var sb = new StringBuilder(); + + sb.AppendLine(""); + sb.AppendLine(""); + sb.AppendLine(""); + sb.AppendLine(""); + sb.AppendLine(""); + sb.AppendLine("Scrapped Jobs"); + sb.AppendLine(@""); + sb.AppendLine(""); + sb.AppendLine(""); + sb.AppendLine("

Scrapped Job Listings

"); + + foreach (var job in jobs) + { + sb.AppendLine("
"); + sb.AppendLine($" "); + sb.AppendLine($"
{System.Net.WebUtility.HtmlEncode(job.companyName ?? "Unknown")} — {System.Net.WebUtility.HtmlEncode(job.location ?? "N/A")}
"); + sb.AppendLine($"
{System.Net.WebUtility.HtmlEncode(job.snippet ?? "No description available.")}
"); + + if (job.tags != null && job.tags.Count > 0) + { + sb.AppendLine("
"); + foreach (var tag in job.tags) + sb.AppendLine($"{System.Net.WebUtility.HtmlEncode(tag)}"); + sb.AppendLine("
"); + } + + sb.AppendLine($" "); + sb.AppendLine($"
Scrapped: {job.scrappedTime:yyyy-MM-dd HH:mm}
"); + sb.AppendLine("
"); + } + + sb.AppendLine(""); + + return sb.ToString(); + } +} diff --git a/src/Backend/appsettings.json b/src/Backend/appsettings.json index c1e8513..d7bf0c3 100644 --- a/src/Backend/appsettings.json +++ b/src/Backend/appsettings.json @@ -14,7 +14,7 @@ "LCProject:ContainerName": "Problems", "JobProject:DatabaseName": "JobDataBase", "JobProject:ContainerName": "JobDetailsContainer", - "JobProject:ScraperContainerName": "ScraperSettingsContainer" + "JobProject:ScraperContainerName": "ScrapperSettingsContainer" }, "ApplicationInsights": { "LogLevel": { diff --git a/src/Common/DatabaseModels/JobScrapperSettings.cs b/src/Common/DatabaseModels/JobScrapperSettings.cs index 17f3c6d..570c209 100644 --- a/src/Common/DatabaseModels/JobScrapperSettings.cs +++ b/src/Common/DatabaseModels/JobScrapperSettings.cs @@ -68,7 +68,7 @@ public ScrapperSettings ToPublicModel() sitesToExclude = this.settings.sitesToExclude, exactTerms = this.settings.exactTerms, negativeTerms = this.settings.negativeTerms, - additionalTerms = this.settings.additionalSearchterms, + additionalSearchTerms = this.settings.additionalSearchTerms, lookBackDays = this.settings.lookBackDays } }; diff --git a/src/Common/DatabaseModels/QuerySettings.cs b/src/Common/DatabaseModels/QuerySettings.cs index c03571a..aa447a6 100644 --- a/src/Common/DatabaseModels/QuerySettings.cs +++ b/src/Common/DatabaseModels/QuerySettings.cs @@ -17,7 +17,7 @@ public class QuerySettings public List exactTerms { get; set; } public List negativeTerms { get; set; } public int lookBackDays { get; set; } = 1; - public List additionalSearchterms { get; set; } + public List additionalSearchTerms { get; set; } public QuerySettings(PublicSettingsModel qs) { @@ -27,7 +27,7 @@ public QuerySettings(PublicSettingsModel qs) sitesToExclude = qs.sitesToExclude; exactTerms = qs.exactTerms; negativeTerms = qs.negativeTerms; - additionalSearchterms = qs.additionalTerms; + additionalSearchTerms = qs.additionalSearchTerms; } } } diff --git a/src/Common/Engines/GSEngine.cs b/src/Common/Engines/GSEngine.cs index 231c96b..f871261 100644 --- a/src/Common/Engines/GSEngine.cs +++ b/src/Common/Engines/GSEngine.cs @@ -83,9 +83,9 @@ public async Task> SearchQueryAsync(JobScrapperSettings settin } // Additional terms (hq) - if (qsettings.additionalSearchterms != null && qsettings.additionalSearchterms.Any()) + if (qsettings.additionalSearchTerms != null && qsettings.additionalSearchTerms.Any()) { - var add = string.Join(" ", qsettings.additionalSearchterms.Where(s => !string.IsNullOrWhiteSpace(s))); + var add = string.Join(" ", qsettings.additionalSearchTerms.Where(s => !string.IsNullOrWhiteSpace(s))); if (!string.IsNullOrWhiteSpace(add)) sb.Append(AddadditionalSearchterms(add)); } diff --git a/src/Common/Managers/JobScrapper.cs b/src/Common/Managers/JobScrapper.cs index f7f9a45..84d6a07 100644 --- a/src/Common/Managers/JobScrapper.cs +++ b/src/Common/Managers/JobScrapper.cs @@ -40,13 +40,21 @@ public async Task RunAsync() return; } - var mp = searchResults.ToDictionary(j => j.id, j => j); + var mp = new Dictionary(StringComparer.OrdinalIgnoreCase); + foreach (var job in searchResults) + { + if (!mp.ContainsKey(job.id)) + { + mp[job.id] = job; + } + } + var levels = await this.aiEngine.GetJobLevelAsync(searchResults); foreach (var level in levels) { if (mp.ContainsKey(level.Key)) { - mp[level.Key].tags.Add(level.Value); + mp[level.Key].tags.AddRange(level.Value.Split("-")); } else { diff --git a/src/Common/Managers/JobScrapperSettingsManager.cs 
diff --git a/src/Common/Managers/JobScrapperSettingsManager.cs b/src/Common/Managers/JobScrapperSettingsManager.cs
index 3d434f7..2c9f48f 100644
--- a/src/Common/Managers/JobScrapperSettingsManager.cs
+++ b/src/Common/Managers/JobScrapperSettingsManager.cs
@@ -25,51 +25,36 @@ public async Task<JobScrapperSettings> CreateOrUpdateSettings(string id, ScrapperSettings publicSettings)
             throw new ArgumentNullException(nameof(publicSettings), "Public settings cannot be null");
         }
 
-        var settingsInDb = _scrapperSettingsContainer.GetItemQueryIterator<JobScrapperSettings>($"SELECT TOP 1* from ScraperSettingsContainer where Id = {id}");
-
-        int count = 0;
-        var existingSettingsList = new List<JobScrapperSettings>();
-        var returnSettings = default(JobScrapperSettings);
-        while (settingsInDb.HasMoreResults)
+        var settingsInDb = await this.GetAllSettings();
+        JobScrapperSettings current = null;
+        if (!string.IsNullOrEmpty(id) && settingsInDb.Any(s => s.id.Equals(id, StringComparison.OrdinalIgnoreCase)))
         {
-            var response = await settingsInDb.ReadNextAsync();
-            existingSettingsList.AddRange(response);
+            current = settingsInDb.First(s => s.id.Equals(id, StringComparison.OrdinalIgnoreCase));
         }
 
-        if(count > 0)
+        if (current != null)
         {
-            var existingSettings = existingSettingsList[0];
-            existingSettings.UpdateFromPublicModel(publicSettings);
-            await _scrapperSettingsContainer.ReplaceItemAsync(
-                existingSettings,
-                existingSettings.id
-            );
-            returnSettings = existingSettings;
+            current.UpdateFromPublicModel(publicSettings);
         }
         else
-        {
-            id = Guid.NewGuid().ToString();
-            returnSettings = await _scrapperSettingsContainer.CreateItemAsync(
-                new JobScrapperSettings(
-                    id,
-                    publicSettings.name,
-                    publicSettings.runIntervalInMinutes,
-                    publicSettings.settings,
-                    true)
-            );
+        { // Restrict total number of settings to 5
+            if (settingsInDb.Count >= 5)
+            {
+                throw new InvalidOperationException("[TooManySettings]: Cannot create more than 5 scrapper settings.");
+            }
+            current = new JobScrapperSettings(id, publicSettings.name, publicSettings.runIntervalInMinutes, publicSettings.settings, true);
         }
-
-        return returnSettings;
+
+        await _scrapperSettingsContainer.UpsertItemAsync(current);
+        return current;
     }
 
     public async Task<JobScrapperSettings> GetSettingsById(string id)
     {
-        var setting = await _scrapperSettingsContainer.ReadItemAsync<JobScrapperSettings>(
-            id,
-            new PartitionKey(id)
-        );
+        var allSettings = await GetAllSettings();
+        var setting = allSettings.FirstOrDefault(s => s.id.Equals(id, StringComparison.OrdinalIgnoreCase));
 
-        if(setting == null)
+        if (setting == null)
         {
             _logger.LogError($"No JobScrapperSettings found with id: {id}");
             throw new KeyNotFoundException($"No JobScrapperSettings found with id: {id}");
@@ -77,10 +62,25 @@ public async Task<JobScrapperSettings> GetSettingsById(string id)
 
         return setting;
     }
+
+    public async Task<bool> UpdateSettingsAsync(string id, JobScrapperSettings jobSetting)
+    {
+        try
+        {
+            await _scrapperSettingsContainer.UpsertItemAsync(jobSetting, new PartitionKey(id));
+            _logger.LogInformation($"Successfully updated JobScrapperSettings with id: {id}");
+        }
+        catch (Exception ex)
+        {
+            _logger.LogError($"Error updating JobScrapperSettings with id: {id}. Exception: {ex.Message}");
+            return false;
+        }
+        return true;
+    }
 
     public async Task<List<JobScrapperSettings>> GetAllSettings()
     {
-        var settingsInDb = _scrapperSettingsContainer.GetItemQueryIterator<JobScrapperSettings>($"SELECT * from ScraperSettingsContainer");
+        var settingsInDb = _scrapperSettingsContainer.GetItemQueryIterator<JobScrapperSettings>($"SELECT * from c");
         var allSettings = new List<JobScrapperSettings>();
         while (settingsInDb.HasMoreResults)
         {
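With the manager changes above, PUT api/jobs/scrappers/{id} both creates (up to the cap of 5) and updates. A request-body sketch against the public ScrapperSettings model; values are illustrative and only the fields visible in this patch are shown:

    {
      "name": "entry-level-india",
      "runIntervalInMinutes": 180,
      "settings": {
        "exactTerms": ["software engineer"],
        "negativeTerms": ["senior"],
        "additionalSearchTerms": ["india"],
        "sitesToExclude": [],
        "lookBackDays": 1
      }
    }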
Exception: {ex.Message}"); + return false; + } + return true; + } public async Task> GetAllSettings() { - var settingsInDb = _scrapperSettingsContainer.GetItemQueryIterator($"SELECT * from ScraperSettingsContainer"); + var settingsInDb = _scrapperSettingsContainer.GetItemQueryIterator($"SELECT * from c"); var allSettings = new List(); while (settingsInDb.HasMoreResults) { diff --git a/src/Common/Models/Public/QuerySettings.cs b/src/Common/Models/Public/QuerySettings.cs index 682021b..4bb8415 100644 --- a/src/Common/Models/Public/QuerySettings.cs +++ b/src/Common/Models/Public/QuerySettings.cs @@ -9,7 +9,7 @@ public class QuerySettings public List sitesToExclude { get; set; } public List exactTerms { get; set; } public List negativeTerms { get; set; } - public List additionalTerms { get; set; } + public List additionalSearchTerms { get; set; } public int lookBackDays { get; set; } } } diff --git a/src/Common/Repositories/JobScrapperSettingsRepository.cs b/src/Common/Repositories/JobScrapperSettingsRepository.cs index 8b2edb0..c997470 100644 --- a/src/Common/Repositories/JobScrapperSettingsRepository.cs +++ b/src/Common/Repositories/JobScrapperSettingsRepository.cs @@ -26,7 +26,7 @@ public JobScrapperSettingsRepository(ICosmosContainerFactory cosmosContainerFact public async Task> GetAllSettings() { - var settingsInDb = _scrapperSettingsContainer.GetItemQueryIterator($"SELECT * from JobScrapperSettings"); + var settingsInDb = _scrapperSettingsContainer.GetItemQueryIterator($"SELECT * from c"); var allSettings = new List(); while (settingsInDb.HasMoreResults) { diff --git a/src/Common/Repositories/JobsRepository.cs b/src/Common/Repositories/JobsRepository.cs index 3086fdb..b2b2c7a 100644 --- a/src/Common/Repositories/JobsRepository.cs +++ b/src/Common/Repositories/JobsRepository.cs @@ -86,6 +86,15 @@ public async Task CreateIfNotExistsAsync(ScrappedJob job) } } + public async Task> GetJobsEasyQueryAsync(string location, string level) + { + var query = "SELECT * FROM c WHERE EXISTS ( SELECT VALUE t FROM t IN c.tags WHERE CONTAINS(LOWER(t), @location) OR CONTAINS(LOWER(t), @unknown) ) ORDER BY c.scrappedTime DESC OFFSET 0 LIMIT 1000"; + var queryDefinition = new QueryDefinition(query).WithParameter("@location", location.ToLower()).WithParameter("@unknown", "unknown"); + var res = await QueryJobsAsync(queryDefinition); + res = res.Where(j => j.tags.Any(t => t.Equals(level, StringComparison.OrdinalIgnoreCase))).ToList(); + return res; + } + private async Task> QueryJobsAsync(string query) {